# set work directory
%pwd
%cd /scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis
%pwd
/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis
'/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis'
# supress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
_stderr = sys.stderr
null = open(os.devnull,'wb')
# project directory
projDir = '/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis/'
# output directory
outDir = projDir + 'output/'
import os
if not os.path.exists(outDir):
os.makedirs(outDir)
# temporary directory
tmpDir = '/scicore/home/schiera/liu0005/tmp/'
import os
if not os.path.exists(tmpDir):
os.makedirs(tmpDir)
# necessary packages
import pickle
import os
import dill
import pandas as pd
import numpy as np
import pyranges as pr
import scanpy as sc
import time
from pycisTopic.cistopic_class import *
from pycisTopic.lda_models import *
from pycisTopic.topic_binarization import *
from pycisTopic.topic_qc import *
from pycisTopic.diff_features import *
from pycistarget.utils import region_names_to_coordinates
from scenicplus.wrappers.run_pycistarget import run_pycistarget
from scenicplus.scenicplus_class import create_SCENICPLUS_object
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.utils import format_egrns
from scenicplus.eregulon_enrichment import *
from scenicplus.plotting.correlation_plot import *
from scenicplus.preprocessing.filtering import apply_std_filtering_to_eRegulons
from scenicplus.RSS import *
from scenicplus.plotting.dotplot import *
from scenicplus.loom import *
tutorial comes from https://pycistopic.readthedocs.io/en/latest/Single_sample_workflow-RTD.html#
# read count matrix of peaks
count_matrix = pd.read_feather(projDir + 'data/atac_data/highToSomite.peaks.counts.feather')
# read peak names and cell names, and use them as row names and colume names for count matrix
peak_names = pd.read_csv(projDir + 'data/atac_data/peak.txt', header=None, sep='\t')
count_matrix.index = peak_names[0]
cell_names = pd.read_csv(projDir + 'data/atac_data/cell.txt', header=None, sep='\t')
count_matrix.columns = cell_names[0]
# check the matrix
count_matrix
# create cisTopic object
cistopic_obj = create_cistopic_object(fragment_matrix=count_matrix)
# add cell information
cell_data = pd.read_csv(projDir+'data/atac_data/cell.metadata.txt', sep='\t')
cistopic_obj.add_cell_data(cell_data)
2022-10-30 14:17:36,470 cisTopic INFO Converting fragment matrix to sparse matrix 2022-10-30 14:22:17,102 cisTopic INFO Creating CistopicObject 2022-10-30 14:22:28,669 cisTopic INFO Done!
# check attributes of cistopic_obj
cistopic_obj_attri = (vars(cistopic_obj))
print(cistopic_obj_attri.keys())
print(cistopic_obj_attri['cell_data'])
dict_keys(['fragment_matrix', 'binary_matrix', 'cell_names', 'region_names', 'cell_data', 'region_data', 'project', 'path_to_fragments', 'selected_model', 'projections'])
# save cistopic_obj
with open(outDir + 'zf_highToSomite_cisTopicObject.pkl', 'wb') as f:
pickle.dump(cistopic_obj, f)
# load cisTopic object
infile = open(outDir + 'zf_highToSomite_cisTopicObject.pkl', 'rb')
cistopic_obj = pickle.load(infile)
infile.close()
cistopic_obj
<pycisTopic.cistopic_class.CistopicObject at 0x2b840ee70f10>
# run models
models=run_cgs_models(cistopic_obj,
n_topics=[400],
n_cpu=40,
n_iter=100,
random_state=555,
alpha=50,
alpha_by_topic=True,
eta=0.1,
eta_by_topic=False,
_temp_dir=tmpDir)
# save
with open(outDir + 'models/zf_highToSomite_models_400.pkl', 'wb') as f:
pickle.dump(models, f)
2022-10-22 14:38:52,950 INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
(run_cgs_model pid=236156) 2022-10-22 14:39:04,765 cisTopic INFO Running model with 200 topics (run_cgs_model pid=236139) 2022-10-22 14:39:04,767 cisTopic INFO Running model with 90 topics (run_cgs_model pid=236160) 2022-10-22 14:39:04,764 cisTopic INFO Running model with 100 topics
'''
# this Mallet method is not working for some reasons
from pycisTopic.lda_models import *
# configure path Mallet
path_to_mallet_binary = '/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/bin/mallet'
# run models
models=run_cgs_models_mallet(path_to_mallet_binary,
cistopic_obj,
n_topics=[60],
n_cpu=40, # The parallelization is done within each model.
n_iter=100,
random_state=555,
alpha=50,
alpha_by_topic=True,
eta=0.1,
eta_by_topic=False,
tmp_path='/scicore/home/schiera/liu0005/tmp/',
save_path='/scicore/home/schiera/liu0005/tmp/')
# save
with open(outDir + 'models/zf_highToSomite_mallet_models_60.pkl', 'wb') as f:
pickle.dump(models, f)
infile1 = open(outDir + 'models/zf_highToSomite_models_10.pkl', 'rb')
infile2 = open(outDir + 'models/zf_highToSomite_models_20.pkl', 'rb')
infile3 = open(outDir + 'models/zf_highToSomite_models_30_40.pkl', 'rb')
infile4 = open(outDir + 'models/zf_highToSomite_models_50.pkl', 'rb')
infile5 = open(outDir + 'models/zf_highToSomite_models_60_70_80.pkl', 'rb')
infile6 = open(outDir + 'models/zf_highToSomite_models_90_100_200.pkl', 'rb')
infile7 = open(outDir + 'models/zf_highToSomite_models_150.pkl', 'rb')
infile8 = open(outDir + 'models/zf_highToSomite_models_250.pkl', 'rb')
infile9 = open(outDir + 'models/zf_highToSomite_models_300.pkl', 'rb')
infile10 = open(outDir + 'models/zf_highToSomite_models_400.pkl', 'rb')
models1 = pickle.load(infile1)
models2 = pickle.load(infile2)
models3 = pickle.load(infile3)
models4 = pickle.load(infile4)
models5 = pickle.load(infile5)
models6 = pickle.load(infile6)
models7 = pickle.load(infile7)
models8 = pickle.load(infile8)
models9 = pickle.load(infile9)
models10 = pickle.load(infile10)
infile1.close()
infile2.close()
infile3.close()
infile4.close()
infile5.close()
infile6.close()
infile7.close()
infile8.close()
infile9.close()
infile10.close()
models = models1 + models2 + models3 + models4 + models5 + models6 + models7 + models8 + models9 + models10
model=evaluate_models(models,
return_model=True,
metrics=['Arun_2010','Cao_Juan_2009', 'Minmo_2011', 'loglikelihood'],
plot_metrics=False,
save= outDir + 'models/model_selection.pdf')
model.
# add model to cisTopicObject
cistopic_obj.add_LDA_model(model)
# save
with open(outDir + 'zf_highToSomite_cisTopicObject_withTopics.pkl', 'wb') as f:
pickle.dump(cistopic_obj, f)
# load cisTopic object
infile = open(outDir + 'zf_highToSomite_cisTopicObject_withTopics.pkl', 'rb')
cistopic_obj = pickle.load(infile)
infile.close()
# binarize the topic-region distribution
region_bin_topics_top3k = binarize_topics(cistopic_obj, method='ntop', ntop = 3000)
region_bin_topics_otsu = binarize_topics(cistopic_obj, method='otsu',plot=True, num_columns=5, save= outDir + 'topic_binarization/otsu.pdf')
# we can now binarize the cell-topic distribions.
binarized_cell_topic = binarize_topics(cistopic_obj, target='cell', method='li', plot=True, num_columns=5, nbins=100)
# we can compute the topic quality control metrics
topic_qc_metrics = compute_topic_metrics(cistopic_obj)
topic_qc_metrics
# create a figure dictionary to put all plots together
fig_dict={}
fig_dict['CoherenceVSAssignments']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Log10_Assignments', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['AssignmentsVSCells_in_bin']=plot_topic_qc(topic_qc_metrics, var_x='Log10_Assignments', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSCells_in_bin']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSRegions_in_bin']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Regions_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSMarginal_dist']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Marginal_topic_dist', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSGini_index']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Gini_index', var_color='Gini_index', plot=False, return_fig=True)
# plot topic stats in one figure
fig=plt.figure(figsize=(40, 43))
i = 1
for fig_ in fig_dict.keys():
plt.subplot(2, 3, i)
img = fig2img(fig_dict[fig_])
plt.imshow(img)
plt.axis('off')
i += 1
plt.subplots_adjust(wspace=0, hspace=-0.70)
fig.savefig(outDir + 'topic_binarization/Topic_qc.pdf', bbox_inches='tight')
plt.show()
topic_annot = topic_annotation(cistopic_obj, annot_var='celltype', binarized_cell_topic=binarized_cell_topic, general_topic_thr = 0.2)
topic_annot
# we can merge the topic metrics and their annotation in a data frame.
topic_qc_metrics = pd.concat([topic_annot[['celltype', 'Ratio_cells_in_topic', 'Ratio_group_in_population']], topic_qc_metrics], axis=1)
topic_qc_metrics.head(1)
| celltype | Ratio_cells_in_topic | Ratio_group_in_population | Log10_Assignments | Assignments | Cells_in_binarized_topic | Regions_in_binarized_topic | Coherence | Marginal_topic_dist | Gini_index | |
|---|---|---|---|---|---|---|---|---|---|---|
| Topic1 | hatching gland(6somite), hatching gland(bud), lens placode(6somite), endoderm(75epi), endothelial progenitors(6somite), cephalic mesoderm(6somite), cephalic mesoderm(bud), prechordal plate(75epi), lateral plate mesoderm(75epi), lateral plate mesoderm(bud), endoderm(6somite), notochord(75epi), ventral diencephalon(6somite), ventral diencephalon(bud), neural floorplate(6somite), endoderm(bud), notochord(bud) | 0.035104 | 0.088066 | 5.996757 | 992561 | 1439 | 2239 | -1.742742 | 0.002014 | 0.889426 |
# save
with open(outDir + 'topic_binarization/Topic_qc_metrics_annot.pkl', 'wb') as f:
pickle.dump(topic_qc_metrics, f)
with open(outDir + 'topic_binarization/binarized_cell_topic.pkl', 'wb') as f:
pickle.dump(binarized_cell_topic, f)
with open(outDir + 'topic_binarization/binarized_topic_region_otsu.pkl', 'wb') as f:
pickle.dump(region_bin_topics_otsu, f)
with open(outDir + 'topic_binarization/binarized_topic_region_top3k.pkl', 'wb') as f:
pickle.dump(region_bin_topics_top3k, f)
# load cisTopic object
infile = open(outDir + 'zf_highToSomite_cisTopicObject_withTopics.pkl', 'rb')
cistopic_obj = pickle.load(infile)
infile.close()
# impute the region accessibility exploting the cell-topic and topic-region probabilities
imputed_acc_obj = impute_accessibility(cistopic_obj, selected_cells=None, selected_regions=None, scale_factor=10**6)
2022-10-24 19:40:18,644 cisTopic INFO Imputing drop-outs 2022-10-24 19:41:01,055 cisTopic INFO Scaling 2022-10-24 19:41:39,657 cisTopic INFO Keep non zero rows 2022-10-24 19:42:30,574 cisTopic INFO Imputed accessibility sparsity: 0.48989547717527104 2022-10-24 19:42:30,578 cisTopic INFO Create CistopicImputedFeatures object 2022-10-24 19:42:30,579 cisTopic INFO Done!
# save
with open(outDir + 'DARs/imputed_acc_obj.pkl', 'wb') as f:
pickle.dump(imputed_acc_obj, f)
'''
# log-normalize the imputed data
# even for 270g memory we can only process for 25000 cells, so we need split the 40992 cells into two parts
c1 = imputed_acc_obj.mtx[:,0:20000] ## get the first 20'000 cells
from pycisTopic.diff_features import *
scale_factor=10**6 # since we have half million peaks, so scale factor should be in the same magnitude
c1 = normalize(c1, norm="l1", axis=0) # for each column (a cell), each elelment divided by the sum of all elelemnts in this coulmn
c1 *= scale_factor
c1 = np.log1p(c1) ## except 0, the minimal value is 0.8285494910665001, the second minimal is 1.2753598240517567
c1 = np.around(c1,decimals=2)
## we need save c1 and remove c1, otherwise there is no memory for c2
with open(outDir + 'DARs/normalized_imputed_acc_1-20000cells_ndarray.pkl', 'wb') as f:
pickle.dump(c1, f)
del c1
c2 = imputed_acc_obj.mtx[:,20000:40992] ## get the last 20'992 cells
from pycisTopic.diff_features import *
scale_factor=10**6
c2 = normalize(c2, norm="l1", axis=0) # for each column (a cell), each elelment divided by the sum of all elelemnts in this coulmn
c2 *= scale_factor
c2 = np.log1p(c2)
c2 = np.around(c2,decimals=2)
with open(outDir + 'DARs/normalized_imputed_acc_20001-40992cells_ndarray.pkl', 'wb') as f:
pickle.dump(c2, f)
del c2
'''
## change imputed_acc_obj values as normalized ones
# change data type to float32, since the normalized value is in float32, we need use the normalized value replace the current ones
imputed_acc_obj.mtx = imputed_acc_obj.mtx.astype(np.float32)
infile1 = open(outDir + 'DARs/normalized_imputed_acc_1-20000cells_ndarray.pkl', 'rb')
c1 = pickle.load(infile1)
infile1.close()
imputed_acc_obj.mtx[:,0:20000] = c1[:,0:20000]
del c1
infile2 = open(outDir + 'DARs/normalized_imputed_acc_20001-40992cells_ndarray.pkl', 'rb')
c2 = pickle.load(infile2)
infile2.close()
imputed_acc_obj.mtx[:,20000:40992] = c2[:,0:20992]
del c2
'''
# save normalized one
with open(outDir + 'DARs/normalized_imputed_acc_obj.pkl', 'wb') as f:
pickle.dump(imputed_acc_obj, f)
'''
del imputed_acc_obj
'''
infile = open(outDir + 'DARs/normalized_imputed_acc_obj.pkl', 'rb')
normalized_imputed_acc_obj = pickle.load(infile)
infile.close()
'''
# identify highly variable regions
variable_regions = find_highly_variable_features(normalized_imputed_acc_obj,
min_disp = 0.05,
min_mean = 0.0125,
max_mean = 3,
max_disp = np.inf,
n_bins=20,
n_top_features=None,
plot=True,
save= outDir + 'DARs/HVR_plot.pdf')
2022-10-25 12:41:33,475 cisTopic INFO Calculating mean 2022-10-25 12:41:40,040 cisTopic INFO Calculating variance
2022-10-25 12:43:00,037 cisTopic INFO Done!
'''
# save variable_regions
with open(outDir + 'DARs/variable_regions.pkl', 'wb') as f:
pickle.dump(variable_regions, f)
'''
## identify differentially accessible regions
# load imputed_acc object
infile1 = open(outDir + 'DARs/imputed_acc_obj.pkl', 'rb')
imputed_acc_obj = pickle.load(infile1)
infile1.close()
# load varaible regions
import pickle
infile2 = open(outDir + 'DARs/variable_regions.pkl', 'rb')
variable_regions = pickle.load(infile2)
infile2.close()
# we do this part in archr, since there is enough memory even we ask 270g
markers_dict= find_diff_features(cistopic_obj,
imputed_acc_obj,
variable='celltype',
var_features=variable_regions,
contrasts=None,
adjpval_thr=0.05,
log2fc_thr=np.log2(1.5),
n_cpu=1) # there is a memory issue if we set pvalue > 1, each cell type costs 7 min
# even set cup=1, there is still no enough memory after finished runing 8 cell types
DAR = pd.read_csv(outDir + '/DARs/DAR.txt', sep='\t')
cell_types=np.unique(DAR["Contrast"])
markers_dict = {}
for i in range(len(cell_types)):
markers_dict[np.unique(DAR["Contrast"])[i]] = DAR[DAR["Contrast"]==cell_types[i]]
x = [print(x + ': '+ str(len(markers_dict[x]))) for x in markers_dict.keys()]
adaxial cells(6somite): 1059 adaxial cells(75epi): 784 adaxial cells(bud): 2782 anterior neural plate border(bud): 3009 cephalic mesoderm(6somite): 659 cephalic mesoderm(bud): 3683 deep cells(oblong): 2731 dorsal anterior(50epi): 710 dorsal anterior(shield): 1172 dorsal diencephalon(6somite): 1036 dorsal diencephalon(bud): 3642 dorsal ectoderm(50epi): 451 dorsal ectoderm(shield): 853 dorsal posterior(50epi): 777 dorsal posterior(shield): 1534 ectoderm(30epi): 2033 ectoderm(dome): 2274 endoderm(6somite): 201 endoderm(75epi): 1612 endoderm(bud): 2009 endoderm(shield): 1434 epidermis(6somite): 11116 epidermis(75epi): 3028 epidermis(bud): 9070 evl(30epi): 3715 evl(50epi): 21119 evl(6somite): 5082 evl(75epi): 25519 evl(bud): 15179 evl(dome): 1131 evl(shield): 33715 hatching gland(6somite): 918 hatching gland(bud): 2151 heart field(6somite): 2164 high cells: 204 hindbrain(6somite): 3369 hindbrain(bud): 3758 lateral plate mesoderm(75epi): 3551 lateral plate mesoderm(bud): 4571 lens placode(6somite): 361 margin tail(50epi): 3259 margin tail(75epi): 1546 margin tail(shield): 1483 midbrain(6somite): 5509 midbrain(bud): 3256 neural crest(6somite): 2033 neural plate anterior(75epi): 2240 neural plate posterior(75epi): 2880 non-dorsal margin(30epi): 706 non-dorsal margin(dome): 131 notochord(6somite): 713 notochord(75epi): 1277 notochord(bud): 2224 olfactory or adenohypophyseal placode(6somite): 1751 optic primordium(6somite): 9306 optic primordium(bud): 10294 otic placode(6somite): 1244 posterior neural plate border(6somite): 8645 posterior neural plate border(bud): 3804 prechordal plate(75epi): 660 pronephric duct+blood island(6somite): 286 psm(6somite): 3012 psm(75epi): 5030 psm(bud): 8819 somite(6somite): 9405 spinal cord(6somite): 4283 spinal cord(bud): 3317 tailbud(6somite): 1121 tailbud(bud): 3150 telencephalon(6somite): 2134 telencephalon(bud): 1754 ventral diencephalon(6somite): 291 ventral diencephalon(bud): 2267 ventral ectoderm(50epi): 1430 ventral ectoderm(shield): 1048 ventrolateral mesoderm(shield): 6203 ventrolateral mesoendoderm(50epi): 8189 ysl(50epi): 3597 ysl(75epi): 8936 ysl(bud): 11674 ysl(shield): 9470
# save
with open(outDir + 'DARs/DARs.pkl', 'wb') as f:
pickle.dump(markers_dict, f)
tutorial comes from: https://scenicplus.readthedocs.io/en/latest/pbmc_multiome_tutorial.html#Motif-enrichment-analysis-using-pycistarget
region_bin_topics_otsu = pickle.load(open(os.path.join(outDir, 'topic_binarization/binarized_topic_region_otsu.pkl'), 'rb'))
region_bin_topics_top3k = pickle.load(open(os.path.join(outDir, 'topic_binarization/binarized_topic_region_top3k.pkl'), 'rb'))
markers_dict = pickle.load(open(os.path.join(outDir, 'DARs/DARs.pkl'), 'rb'))
# convert to dictionary of pyranges objects.
region_sets = {}
region_sets['topics_otsu'] = {}
region_sets['topics_top_3'] = {}
region_sets['DARs'] = {}
for topic in region_bin_topics_otsu.keys():
regions = region_bin_topics_otsu[topic].index[region_bin_topics_otsu[topic].index.str.startswith('chr')] #only keep regions on known chromosomes
region_sets['topics_otsu'][topic] = pr.PyRanges(region_names_to_coordinates(regions))
for topic in region_bin_topics_top3k.keys():
regions = region_bin_topics_top3k[topic].index[region_bin_topics_top3k[topic].index.str.startswith('chr')] #only keep regions on known chromosomes
region_sets['topics_top_3'][topic] = pr.PyRanges(region_names_to_coordinates(regions))
for DAR in markers_dict.keys():
regions = markers_dict[DAR].index[markers_dict[DAR].index.str.startswith('chr')] #only keep regions on known chromosomes
region_sets['DARs'][DAR] = pr.PyRanges(region_names_to_coordinates(regions))
for key in region_sets.keys():
print(f'{key}: {region_sets[key].keys()}')
topics_otsu: topics_top_3: DARs:
region_sets['topics_otsu']['Topic1']
| Chromosome | Start | End | |
|---|---|---|---|
| 0 | chr1 | 39455761 | 39456261 |
| 1 | chr1 | 45758707 | 45759207 |
| 2 | chr1 | 46493435 | 46493935 |
| 3 | chr1 | 168701 | 169201 |
| 4 | chr1 | 30627428 | 30627928 |
| ... | ... | ... | ... |
| 2234 | chr25 | 22805637 | 22806137 |
| 2235 | chr25 | 3870519 | 3871019 |
| 2236 | chr25 | 31564724 | 31565224 |
| 2237 | chr25 | 2815675 | 2816175 |
| 2238 | chr25 | 18216903 | 18217403 |
2239 rows × 3 columns
# change the data format for rankings database and scores database
rankings_db = os.path.join(projDir, 'data/motif_data/motif.ranks.v2.feather')
scores_db = os.path.join(projDir, 'data/motif_data/motif.scores.v2.feather')
rankings_data = pd.read_feather(rankings_db)
rankings_data2 = rankings_data.iloc[:,0:444653].astype('int32')
rankings_data2.iloc[:,444653:444654] = rankings_data.iloc[:,444653:444654]
rankings_data3 = pd.concat([rankings_data2, rankings_data.iloc[:,444653:444654]], axis=1)
rankings_data3.to_feather ("/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis/data/motif_data/cluster_SCREEN.regions_vs_motifs.rankings.feather")
scores_data = pd.read_feather(scores_db)
scores_data2 = scores_data.iloc[:,0:444653].astype('float32')
scores_data2.iloc[:,444653:444654] = scores_data.iloc[:,444653:444654]
scores_data3 = pd.concat([scores_data2, scores_data.iloc[:,444653:444654]], axis=1)
scores_data3.to_feather ("/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis/data/motif_data/cluster_SCREEN.regions_vs_motifs.scores.v2.feather")
rankings_data
| chr10:10002944-10003444 | chr10:10003889-10004389 | chr10:10004783-10005283 | chr10:10008043-10008543 | chr10:10010515-10011015 | chr10:10013193-10013693 | chr10:10015603-10016103 | chr10:10016141-10016641 | chr10:10017245-10017745 | chr10:10017906-10018406 | ... | chr9:9980361-9980861 | chr9:9982342-9982842 | chr9:99830-100330 | chr9:9986744-9987244 | chr9:9989524-9990024 | chr9:9990177-9990677 | chr9:9992237-9992737 | chr9:9994520-9995020 | chr9:9998473-9998973 | motifs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 210970.0 | 140162.0 | 329619.0 | 139625.0 | 12098.0 | 141625.0 | 252753.0 | 440251.0 | 95493.0 | 309483.0 | ... | 43980.0 | 186246.0 | 339773.0 | 81884.0 | 159885.0 | 111277.0 | 369298.0 | 23704.0 | 258417.0 | AL807792.3 |
| 1 | 210970.0 | 140162.0 | 329619.0 | 139625.0 | 12098.0 | 141625.0 | 252753.0 | 440251.0 | 95493.0 | 309483.0 | ... | 43980.0 | 186246.0 | 339773.0 | 81884.0 | 159885.0 | 111277.0 | 369298.0 | 23704.0 | 258417.0 | atoh1a |
| 2 | 210970.0 | 140162.0 | 329619.0 | 139625.0 | 12098.0 | 141625.0 | 252753.0 | 440251.0 | 95493.0 | 309483.0 | ... | 43980.0 | 186246.0 | 339773.0 | 81884.0 | 159885.0 | 111277.0 | 369298.0 | 23704.0 | 258417.0 | atoh1b |
| 3 | 210970.0 | 140162.0 | 329619.0 | 139625.0 | 12098.0 | 141625.0 | 252753.0 | 440251.0 | 95493.0 | 309483.0 | ... | 43980.0 | 186246.0 | 339773.0 | 81884.0 | 159885.0 | 111277.0 | 369298.0 | 23704.0 | 258417.0 | atoh1c |
| 4 | 210970.0 | 140162.0 | 329619.0 | 139625.0 | 12098.0 | 141625.0 | 252753.0 | 440251.0 | 95493.0 | 309483.0 | ... | 43980.0 | 186246.0 | 339773.0 | 81884.0 | 159885.0 | 111277.0 | 369298.0 | 23704.0 | 258417.0 | bloc1s2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 907 | 116087.0 | 275516.0 | 421974.0 | 166876.0 | 55116.0 | 234540.0 | 233367.0 | 156814.0 | 355398.0 | 384833.0 | ... | 185121.0 | 326672.0 | 390268.0 | 134462.0 | 354930.0 | 97549.0 | 247974.0 | 361489.0 | 5034.0 | znf384l |
| 908 | 72519.0 | 153827.0 | 112587.0 | 152487.0 | 164072.0 | 72289.0 | 427372.0 | 388657.0 | 84688.0 | 397434.0 | ... | 151589.0 | 325407.0 | 271012.0 | 292360.0 | 161830.0 | 130951.0 | 376447.0 | 280961.0 | 377607.0 | znf410 |
| 909 | 340369.0 | 113664.0 | 280685.0 | 409479.0 | 258887.0 | 195843.0 | 364326.0 | 44322.0 | 337690.0 | 187127.0 | ... | 230116.0 | 162260.0 | 406442.0 | 78966.0 | 420317.0 | 301185.0 | 142135.0 | 182187.0 | 433461.0 | znf423 |
| 910 | 382654.0 | 65343.0 | 9933.0 | 137071.0 | 377358.0 | 382655.0 | 160793.0 | 409440.0 | 124034.0 | 148695.0 | ... | 188501.0 | 210257.0 | 320600.0 | 294394.0 | 33965.0 | 443686.0 | 254570.0 | 4557.0 | 428660.0 | znf740a |
| 911 | 382654.0 | 65343.0 | 9933.0 | 137071.0 | 377358.0 | 382655.0 | 160793.0 | 409440.0 | 124034.0 | 148695.0 | ... | 188501.0 | 210257.0 | 320600.0 | 294394.0 | 33965.0 | 443686.0 | 254570.0 | 4557.0 | 428660.0 | znf740b |
912 rows × 444654 columns
# define rankings, score and motif annotation database
rankings_db = os.path.join(projDir, 'data/motif_data/cluster_SCREEN.regions_vs_motifs.rankings.v2.feather')
scores_db = os.path.join(projDir, 'data/motif_data/cluster_SCREEN.regions_vs_motifs.scores.v2.feather')
motif_annotation = os.path.join(projDir, 'data/motif_data/motif.annotation.txt')
# run pycistarget
species = 'danio_rerio'
run_pycistarget(
region_sets = region_sets,
species = species,
biomart_host = 'http://apr2020.archive.ensembl.org/',
save_path = os.path.join(outDir, 'motifs'),
ctx_db_path = rankings_db,
dem_db_path = scores_db,
promoter_space = 1000,
run_without_promoters = True,
path_to_motif_annotations = motif_annotation,
_temp_dir = tmpDir,
annotation_version = 'danio_code_v1'
)
# run pycistarget, ctx_nes_threshold set as 0
species = 'danio_rerio'
run_pycistarget(
region_sets = region_sets,
species = species,
biomart_host = 'http://apr2020.archive.ensembl.org/',
save_path = os.path.join(outDir, 'motifs/motifs_without_auc_filter'),
ctx_db_path = rankings_db,
dem_db_path = scores_db,
ctx_nes_threshold = 0,
promoter_space = 1000,
run_without_promoters = True,
path_to_motif_annotations = motif_annotation,
n_cpu = 1,
_temp_dir = tmpDir,
annotation_version = 'danio_code_v1'
)
# explore the results
menr = dill.load(open(os.path.join(outDir, 'motifs/menr.pkl'), 'rb'))
for key, value in menr.items() :
print (key)
CTX_topics_otsu_All CTX_topics_otsu_No_promoters DEM_topics_otsu_All DEM_topics_otsu_No_promoters CTX_topics_top_3_All CTX_topics_top_3_No_promoters DEM_topics_top_3_All DEM_topics_top_3_No_promoters CTX_DARs_All CTX_DARs_No_promoters DEM_DARs_All DEM_DARs_No_promoters
for k1,v1 in menr['CTX_DARs_No_promoters'].items():
print (k1)
for k2, v2 in menr['CTX_DARs_No_promoters'][k1].cistromes['Database'].items():
print (k2)
# output all tfbs for each cell type specific peaks
df = pd.DataFrame(columns = ['celltype','regulon','enhancer'])
for k1,v1 in menr['CTX_DARs_All'].items():
for k2, v2 in menr['CTX_DARs_All'][k1].cistromes['Database'].items():
for v3 in v2:
df2 = pd.DataFrame({'celltype': k1, 'regulon': k2, 'enhancer': v3},index=[0])
df = df.append(df2)
df.to_csv("output/motifs/CTX_DARs_All_regulons.txt",sep ='\t',index=False,header=True)
# when working with the DEM method instead of the CTX method
df = pd.DataFrame(columns = ['celltype','regulon','enhancer'])
for k1,v1 in menr['DEM_DARs_No_promoters'].cistromes['Database'].items():
for k2, v2 in menr['DEM_DARs_No_promoters'].cistromes['Database'][k1].items():
for v3 in v2:
df2 = pd.DataFrame({'celltype': k1, 'regulon': k2, 'enhancer': v3},index=[0])
df = df.append(df2)
df.to_csv("output/motifs/DEM_DARs_No_promoters_regulons.txt",sep ='\t',index=False,header=True)
# output enriched motifs for each cell type
df = pd.DataFrame(columns = ['Logo','Region_set','Direct_annot','Orthology_annot','NES','AUC','Rank_at_max','Motif_hits'])
for k1,v1 in menr['CTX_DARs_All'].items():
df2 = menr['CTX_DARs_All'][k1].motif_enrichment
df = df.append(df2)
df.to_csv("output/motifs/CTX_DARs_All_regulons_motif_enrichment.txt",sep ='\t',index=False,header=True)
menr['CTX_DARs_All']['adaxial cells(6somite)'].motif_enrichment
| Logo | Region_set | Direct_annot | Orthology_annot | NES | AUC | Rank_at_max | Motif_hits | |
|---|---|---|---|---|---|---|---|---|
| myog | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/myog.png" width="200" > | adaxial cells(6somite) | myog | NaN | 8.139853 | 0.024555 | 21907.0 | 293 |
| tal1 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/tal1.png" width="200" > | adaxial cells(6somite) | tal1 | NaN | 5.777286 | 0.018207 | 21936.0 | 292 |
| figla | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/figla.png" width="200" > | adaxial cells(6somite) | figla | NaN | 5.761635 | 0.018165 | 18587.0 | 182 |
| myod1 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/myod1.png" width="200" > | adaxial cells(6somite) | myod1 | NaN | 5.610823 | 0.017760 | 22208.0 | 388 |
| ttf1.4 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/ttf1.4.png" width="200" > | adaxial cells(6somite) | ttf1.4 | NaN | 5.342396 | 0.017038 | 21895.0 | 229 |
| myf6 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/myf6.png" width="200" > | adaxial cells(6somite) | myf6 | NaN | 5.342396 | 0.017038 | 21895.0 | 229 |
| meis3 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/meis3.png" width="200" > | adaxial cells(6somite) | meis3 | NaN | 4.601930 | 0.015049 | 16295.0 | 157 |
| msc | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/msc.png" width="200" > | adaxial cells(6somite) | msc | NaN | 4.549763 | 0.014909 | 22000.0 | 242 |
| tcf21 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/tcf21.png" width="200" > | adaxial cells(6somite) | tcf21 | NaN | 4.400689 | 0.014508 | 22102.0 | 245 |
| mespab | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespab.png" width="200" > | adaxial cells(6somite) | mespab | NaN | 4.113608 | 0.013737 | 21949.0 | 201 |
| mespba | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespba.png" width="200" > | adaxial cells(6somite) | mespba | NaN | 4.113608 | 0.013737 | 21949.0 | 201 |
| mespbb | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespbb.png" width="200" > | adaxial cells(6somite) | mespbb | NaN | 4.113608 | 0.013737 | 21949.0 | 201 |
| mespaa | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespaa.png" width="200" > | adaxial cells(6somite) | mespaa | NaN | 4.113608 | 0.013737 | 21949.0 | 201 |
| TCF4 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/TCF4.png" width="200" > | adaxial cells(6somite) | TCF4 | NaN | 3.668444 | 0.012540 | 21020.0 | 194 |
| scrt1a | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/scrt1a.png" width="200" > | adaxial cells(6somite) | scrt1a | NaN | 3.597780 | 0.012350 | 10775.0 | 79 |
| scrt2 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/scrt2.png" width="200" > | adaxial cells(6somite) | scrt2 | NaN | 3.597780 | 0.012350 | 10775.0 | 79 |
| scrt1b | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/scrt1b.png" width="200" > | adaxial cells(6somite) | scrt1b | NaN | 3.597780 | 0.012350 | 10775.0 | 79 |
| zic3 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/zic3.png" width="200" > | adaxial cells(6somite) | zic3 | NaN | 3.592721 | 0.012337 | 1264.0 | 18 |
| mef2d | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mef2d.png" width="200" > | adaxial cells(6somite) | mef2d | NaN | 3.287777 | 0.011517 | 829.0 | 13 |
| mef2ab | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mef2ab.png" width="200" > | adaxial cells(6somite) | mef2ab | NaN | 3.270388 | 0.011471 | 829.0 | 13 |
| mef2aa | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mef2aa.png" width="200" > | adaxial cells(6somite) | mef2aa | NaN | 3.270388 | 0.011471 | 829.0 | 13 |
| gli2b | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/gli2b.png" width="200" > | adaxial cells(6somite) | gli2b | NaN | 3.221540 | 0.011340 | 17827.0 | 139 |
| gli1 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/gli1.png" width="200" > | adaxial cells(6somite) | gli1 | NaN | 3.221540 | 0.011340 | 17827.0 | 139 |
| neurod1 | <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/neurod1.png" width="200" > | adaxial cells(6somite) | neurod1 | NaN | 3.014450 | 0.010783 | 22182.0 | 170 |
menr['DEM_DARs_All'].DEM_results('adaxial cells(6somite)')
| Logo | Contrast | Direct_annot | Orthology_annot | Log2FC | Adjusted_pval | Mean_fg | Mean_bg | Motif_hit_thr | Motif_hits | |
|---|---|---|---|---|---|---|---|---|---|---|
| tcf21 | ![]() |
adaxial cells(6somite) | tcf21 | NaN | 2.118818 | 0.0 | 5.305873 | 1.2216 | 3.0 | 682.0 |
| mbd2 | ![]() |
adaxial cells(6somite) | mbd2 | NaN | 2.017327 | 0.000001 | 1.348093 | 0.333 | 3.0 | 239.0 |
| patz1 | ![]() |
adaxial cells(6somite) | patz1 | NaN | 1.667604 | 0.000001 | 1.955043 | 0.6154 | 3.0 | 418.0 |
| hic1 | ![]() |
adaxial cells(6somite) | hic1 | NaN | 1.576642 | 0.000002 | 1.415373 | 0.47452 | 3.0 | 333.0 |
| zbtb14 | ![]() |
adaxial cells(6somite) | zbtb14 | NaN | 1.316687 | 0.000431 | 1.248556 | 0.50124 | 3.0 | 294.0 |
| tcf12 | ![]() |
adaxial cells(6somite) | tcf12 | NaN | 1.175431 | 0.0 | 10.340021 | 4.57806 | 3.0 | 973.0 |
| TCF4 | ![]() |
adaxial cells(6somite) | TCF4 | NaN | 1.055659 | 0.0 | 5.953263 | 2.86398 | 3.0 | 495.0 |
| hey1 | ![]() |
adaxial cells(6somite) | hey1 | NaN | 1.038834 | 0.0 | 2.843275 | 1.38388 | 3.0 | 534.0 |
| heyl | ![]() |
adaxial cells(6somite) | heyl | NaN | 1.038834 | 0.0 | 2.843275 | 1.38388 | 3.0 | 534.0 |
| tcf3a | ![]() |
adaxial cells(6somite) | tcf3a | NaN | 0.912908 | 0.0 | 10.153653 | 5.392742 | 3.0 | 1023.0 |
| tcf3b | ![]() |
adaxial cells(6somite) | tcf3b | NaN | 0.912908 | 0.0 | 10.153653 | 5.392742 | 3.0 | 1023.0 |
| tfap4 | ![]() |
adaxial cells(6somite) | tfap4 | NaN | 0.775537 | 0.0 | 8.948742 | 5.227597 | 3.0 | 998.0 |
| AL807792.3 | ![]() |
adaxial cells(6somite) | AL807792.3 | NaN | 0.735716 | 0.0 | 8.7317 | 5.24356 | 3.0 | 998.0 |
| atoh1a | ![]() |
adaxial cells(6somite) | atoh1a | NaN | 0.735716 | 0.0 | 8.7317 | 5.24356 | 3.0 | 998.0 |
| atoh1b | ![]() |
adaxial cells(6somite) | atoh1b | NaN | 0.735716 | 0.0 | 8.7317 | 5.24356 | 3.0 | 998.0 |
| atoh1c | ![]() |
adaxial cells(6somite) | atoh1c | NaN | 0.735716 | 0.0 | 8.7317 | 5.24356 | 3.0 | 998.0 |
| bloc1s2 | ![]() |
adaxial cells(6somite) | bloc1s2 | NaN | 0.735716 | 0.0 | 8.7317 | 5.24356 | 3.0 | 998.0 |
| klf15 | ![]() |
adaxial cells(6somite) | klf15 | NaN | 0.644429 | 0.0 | 3.397787 | 2.17372 | 3.0 | 525.0 |
| si:ch211-117k10.3 | ![]() |
adaxial cells(6somite) | si:ch211-117k10.3 | NaN | 0.644429 | 0.0 | 3.397787 | 2.17372 | 3.0 | 525.0 |
| msc | ![]() |
adaxial cells(6somite) | msc | NaN | 0.642546 | 0.0 | 9.303607 | 5.959719 | 3.0 | 1014.0 |
| myog | ![]() |
adaxial cells(6somite) | myog | NaN | 0.636861 | 0.0 | 9.977282 | 6.416499 | 3.0 | 1043.0 |
| znf148 | ![]() |
adaxial cells(6somite) | znf148 | NaN | 0.610756 | 0.000001 | 3.114345 | 2.03944 | 3.0 | 501.0 |
| zbtb18 | ![]() |
adaxial cells(6somite) | zbtb18 | NaN | 0.594757 | 0.0 | 5.468046 | 3.620698 | 3.0 | 775.0 |
| zbtb42 | ![]() |
adaxial cells(6somite) | zbtb42 | NaN | 0.594757 | 0.0 | 5.468046 | 3.620698 | 3.0 | 775.0 |
| srebf1 | ![]() |
adaxial cells(6somite) | srebf1 | NaN | 0.583174 | 0.0 | 3.290217 | 2.1962 | 3.0 | 550.0 |
| rreb1a | ![]() |
adaxial cells(6somite) | rreb1a | NaN | 0.538285 | 0.0 | 6.766159 | 4.659101 | 3.0 | 780.0 |
| rreb1b | ![]() |
adaxial cells(6somite) | rreb1b | NaN | 0.538285 | 0.0 | 6.766159 | 4.659101 | 3.0 | 780.0 |
| znf598 | ![]() |
adaxial cells(6somite) | znf598 | NaN | 0.538285 | 0.0 | 6.766159 | 4.659101 | 3.0 | 780.0 |
| CABZ01090890.1 | ![]() |
adaxial cells(6somite) | CABZ01090890.1 | NaN | 0.53278 | 0.000011 | 2.525326 | 1.745559 | 3.0 | 429.0 |
| atf6 | ![]() |
adaxial cells(6somite) | atf6 | NaN | 0.53278 | 0.000011 | 2.525326 | 1.745559 | 3.0 | 429.0 |
| runx3 | ![]() |
adaxial cells(6somite) | runx3 | NaN | 0.522043 | 0.001215 | 2.306195 | 1.606 | 3.0 | 419.0 |
| myf6 | ![]() |
adaxial cells(6somite) | myf6 | NaN | 0.520044 | 0.0 | 9.410133 | 6.56216 | 3.0 | 1039.0 |
| ttf1.4 | ![]() |
adaxial cells(6somite) | ttf1.4 | NaN | 0.520044 | 0.0 | 9.410133 | 6.56216 | 3.0 | 1039.0 |
tutorial comes from: https://scenicplus.readthedocs.io/en/latest/Scenicplus_step_by_step-RTD.html#
# load the data for SCENIC+
adata = sc.read_h5ad(os.path.join(projDir, 'data/rna_data/highTOsomite6.h5ad'))
cistopic_obj = dill.load(open(os.path.join(projDir, 'output/zf_highToSomite_cisTopicObject_withTopics.pkl'), 'rb'))
menr = dill.load(open(os.path.join(projDir, 'output/motifs/menr.pkl'), 'rb'))
# check cell name in scATAC-seq data
cistopic_obj_attri = (vars(cistopic_obj))
print(cistopic_obj_attri.keys())
cistopic_obj_attri['cell_data'].head(3)
dict_keys(['fragment_matrix', 'binary_matrix', 'cell_names', 'region_names', 'cell_data', 'region_data', 'project', 'path_to_fragments', 'selected_model', 'projections'])
| cisTopic_nr_frag | cisTopic_log_nr_frag | cisTopic_nr_acc | cisTopic_log_nr_acc | sample_id | n_genes | total_counts | pct_conuts_mt | celltype | |
|---|---|---|---|---|---|---|---|---|---|
| high_CGTGCTGCATGCTTAG-1___cisTopic | 41625 | 4.619354 | 34712 | 4.54048 | cisTopic | 921 | 1446 | 2.297297 | high cells |
| high_GGGCAATAGTTAACCA-1___cisTopic | 32104 | 4.506559 | 27277 | 4.435797 | cisTopic | 1566 | 2754 | 30.030488 | high cells |
| high_GTACAATGTAGGATCC-1___cisTopic | 32947 | 4.517816 | 28057 | 4.448041 | cisTopic | 1247 | 2085 | 18.427230 | high cells |
# check the cell name in scRNA-seq data
adata.obs.head(3)
| orig.ident | nCount_RNA | nFeature_RNA | percent.mt | nCount_SCT | nFeature_SCT | orig.ident2 | cell.type | |
|---|---|---|---|---|---|---|---|---|
| high_CGTGCTGCATGCTTAG-1 | high | 1446.0 | 921 | 2.297297 | 2921.0 | 1007 | 1-high | high cells |
| high_GGGCAATAGTTAACCA-1 | high | 2754.0 | 1566 | 30.030488 | 3259.0 | 1566 | 1-high | high cells |
| high_GTACAATGTAGGATCC-1 | high | 2085.0 | 1247 | 18.427230 | 3361.0 | 1247 | 1-high | high cells |
# create the SCENIC+ object
scplus_obj = create_SCENICPLUS_object(
GEX_anndata = adata.raw.to_adata(),
cisTopic_obj = cistopic_obj,
menr = menr,
bc_transform_func = lambda x: f'{x}___cisTopic'
)
scplus_obj.X_EXP = np.array(scplus_obj.X_EXP.todense())
scplus_obj
2022-11-07 21:41:11,789 cisTopic INFO Imputing drop-outs 2022-11-07 21:41:50,687 cisTopic INFO Scaling 2022-11-07 21:42:33,348 cisTopic INFO Keep non zero rows 2022-11-07 21:43:24,993 cisTopic INFO Imputed accessibility sparsity: 0.48989547717527104 2022-11-07 21:43:24,994 cisTopic INFO Create CistopicImputedFeatures object 2022-11-07 21:43:24,995 cisTopic INFO Done!
SCENIC+ object with n_cells x n_genes = 40992 x 27406 and n_cells x n_regions = 40992 x 444253 metadata_regions:'Chromosome', 'Start', 'End', 'Width', 'cisTopic_nr_frag', 'cisTopic_log_nr_frag', 'cisTopic_nr_acc', 'cisTopic_log_nr_acc' metadata_genes:'_index' metadata_cell:'GEX_orig.ident', 'GEX_nCount_RNA', 'GEX_nFeature_RNA', 'GEX_percent.mt', 'GEX_nCount_SCT', 'GEX_nFeature_SCT', 'GEX_orig.ident2', 'GEX_cell.type', 'ACC_cisTopic_nr_frag', 'ACC_cisTopic_log_nr_frag', 'ACC_cisTopic_nr_acc', 'ACC_cisTopic_log_nr_acc', 'ACC_sample_id', 'ACC_n_genes', 'ACC_total_counts', 'ACC_pct_conuts_mt', 'ACC_celltype' menr:'CTX_topics_otsu_All', 'CTX_topics_otsu_No_promoters', 'DEM_topics_otsu_All', 'DEM_topics_otsu_No_promoters', 'CTX_topics_top_3_All', 'CTX_topics_top_3_No_promoters', 'DEM_topics_top_3_All', 'DEM_topics_top_3_No_promoters', 'CTX_DARs_All', 'CTX_DARs_No_promoters', 'DEM_DARs_All', 'DEM_DARs_No_promoters' dr_cell:'GEX_X_umap'
# check scplus_obj gene name
scplus_obj.gene_names
scplus_obj.metadata_genes = adata.var.copy(deep=True)
scplus_obj.gene_names
Index(['ptpn12', 'phtf2', 'phtf2.1', 'CU856344.1', 'si:zfos-932h1.3', 'mansc1',
'lrp6', 'dusp16', 'crebl2', 'gpr19',
...
'ENSDARG00000086262', 'ENSDARG00000077638', 'ENSDARG00000086236',
'ENSDARG00000090041', 'ENSDARG00000087469', 'ENSDARG00000059507',
'ENSDARG00000012496', 'ENSDARG00000098374', 'ENSDARG00000116593',
'ENSDARG00000088842'],
dtype='object', length=27406)
# save scplus_obj
with open(outDir + 'zf_highToSomite_scplusObject.pkl', 'wb') as f:
pickle.dump(scplus_obj, f)
# only use variable genes
variable_genes = pd.read_csv(projDir+'data/rna_data/variable_genes.txt', sep='\t')
variable_genes_list = variable_genes['gene name'].astype(str).tolist()
scplus_obj.subset(genes=variable_genes_list)
# save scplus_obj
with open(outDir + 'zf_highToSomite_scplusObject_withFilter.pkl', 'wb') as f:
pickle.dump(scplus_obj, f)
# generate cistromes
start_time = time.time()
merge_cistromes(scplus_obj)
time = time.time()-start_time
print(time/60)
0.6433848579724629
# infer enhancer to gene relationships
get_search_space(scplus_obj,
biomart_host = 'http://apr2020.archive.ensembl.org/',
species = 'drerio',
assembly = 'danRer11',
upstream = [1000, 150000],
downstream = [1000, 150000])
2022-11-08 11:24:36,499 R2G INFO Downloading gene annotation from biomart dataset: drerio_gene_ensembl
2022-11-08 11:25:01,449 R2G INFO Downloading chromosome sizes from: http://hgdownload.cse.ucsc.edu/goldenPath/danRer11/bigZips/danRer11.chrom.sizes
2022-11-08 11:25:04,461 R2G INFO Extending promoter annotation to 10 bp upstream and 10 downstream
2022-11-08 11:25:16,067 R2G INFO Extending search space to:
150000 bp downstream of the end of the gene.
150000 bp upstream of the start of the gene.
2022-11-08 11:25:57,552 R2G INFO Intersecting with regions.
join: Strand data from other will be added as strand data to self. If this is undesired use the flag apply_strand_suffix=False. To turn off the warning set apply_strand_suffix to True or False.
2022-11-08 11:25:59,162 R2G INFO Calculating distances from region to gene 2022-11-08 11:29:46,606 R2G INFO Imploding multiple entries per region and gene 2022-11-08 11:37:13,585 R2G INFO Done!
# infer enhancer to gene relationships
calculate_regions_to_genes_relationships(scplus_obj,
ray_n_cpu = 20,
_temp_dir = tmpDir,
importance_scoring_method = 'GBM',
importance_scoring_kwargs = GBM_KWARGS)
2022-11-08 11:43:03,412 R2G INFO Calculating region to gene importances, using GBM method
2022-11-08 11:43:28,399 INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 initializing: 78%|███████▊ | 4549/5812 [12:59<10:32, 2.00it/s](raylet) Spilled 2235 MiB, 195 objects, write throughput 135 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message. initializing: 79%|███████▉ | 4580/5812 [13:04<03:16, 6.25it/s](raylet) Spilled 4186 MiB, 369 objects, write throughput 203 MiB/s. initializing: 82%|████████▏ | 4745/5812 [13:36<03:01, 5.88it/s](raylet) Spilled 29764 MiB, 2669 objects, write throughput 1034 MiB/s. initializing: 82%|████████▏ | 4756/5812 [13:38<02:47, 6.31it/s](raylet) Spilled 30085 MiB, 2699 objects, write throughput 1026 MiB/s. initializing: 82%|████████▏ | 4763/5812 [13:39<03:00, 5.80it/s](raylet) Spilled 75151 MiB, 6666 objects, write throughput 2469 MiB/s. initializing: 82%|████████▏ | 4769/5812 [13:40<02:47, 6.24it/s](raylet) Spilled 75309 MiB, 6682 objects, write throughput 2446 MiB/s. initializing: 100%|██████████| 5812/5812 [18:05<00:00, 5.36it/s] Running using 20 cores: 100%|██████████| 5812/5812 [33:03<00:00, 2.93it/s]
2022-11-08 12:34:43,328 R2G INFO Took 3099.9146208763123 seconds 2022-11-08 12:34:43,329 R2G INFO Calculating region to gene correlation, using SR method
2022-11-08 12:35:07,131 INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
initializing: 100%|██████████| 5812/5812 [14:39<00:00, 6.61it/s]
Running using 20 cores: 100%|██████████| 5812/5812 [00:04<00:00, 1172.40it/s]
2022-11-08 12:49:57,251 R2G INFO Took 913.9199676513672 seconds 2022-11-08 12:50:02,485 R2G INFO Done!
# save
with open(outDir + 'zf_highToSomite_scplusObject_withEnhancerToGene.pkl', 'wb') as f:
pickle.dump(scplus_obj, f)
infile = open(outDir + 'zf_highToSomite_scplusObject_withEnhancerToGene.pkl', 'rb')
scplus_obj = pickle.load(infile)
infile.close()
## infer TF to gene relationships
tf_file = os.path.join(projDir, 'data/motif_data/tf.txt')
calculate_TFs_to_genes_relationships(scplus_obj,
tf_file = tf_file,
ray_n_cpu = 20,
method = 'GBM',
_temp_dir = tmpDir,
key= 'TF2G_adj')
2022-11-08 15:10:30,727 INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
2022-11-08 15:10:32,710 TF2G INFO Calculating TF to gene correlation, using GBM method
initializing: 1%|▏ | 86/6054 [00:22<24:19, 4.09it/s] (raylet) Spilled 4011 MiB, 8 objects, write throughput 780 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message. (raylet) Spilled 7020 MiB, 14 objects, write throughput 1140 MiB/s. (raylet) Spilled 10920 MiB, 20 objects, write throughput 1476 MiB/s. (raylet) Spilled 17997 MiB, 35 objects, write throughput 1704 MiB/s. initializing: 2%|▏ | 103/6054 [01:02<51:58, 1.91it/s] (raylet) Spilled 34157 MiB, 69 objects, write throughput 1703 MiB/s. initializing: 2%|▏ | 113/6054 [01:18<5:16:22, 3.20s/it](raylet) Spilled 99294 MiB, 198 objects, write throughput 2794 MiB/s. initializing: 2%|▏ | 146/6054 [01:33<1:17:59, 1.26it/s](raylet) Spilled 132392 MiB, 264 objects, write throughput 2721 MiB/s. initializing: 5%|▍ | 300/6054 [02:21<23:18, 4.11it/s] (raylet) Spilled 264784 MiB, 528 objects, write throughput 2954 MiB/s. initializing: 10%|▉ | 576/6054 [03:44<17:29, 5.22it/s] (raylet) Spilled 527619 MiB, 1053 objects, write throughput 3042 MiB/s. initializing: 18%|█▊ | 1091/6054 [06:39<37:48, 2.19it/s] (raylet) Spilled 1051113 MiB, 2096 objects, write throughput 3021 MiB/s. initializing: 35%|███▍ | 2107/6054 [12:24<27:33, 2.39it/s] (raylet) Spilled 2097212 MiB, 4182 objects, write throughput 3034 MiB/s. initializing: 69%|██████▉ | 4203/6054 [29:51<08:28, 3.64it/s] (raylet) Spilled 4197433 MiB, 8370 objects, write throughput 2646 MiB/s. initializing: 100%|██████████| 6054/6054 [42:45<00:00, 2.36it/s] Running using 20 cores: 100%|██████████| 6054/6054 [55:02<00:00, 1.83it/s]
2022-11-08 16:48:25,618 TF2G INFO Took 5872.906021118164 seconds 2022-11-08 16:48:25,620 TF2G INFO Adding correlation coefficients to adjacencies. 2022-11-08 16:48:37,842 TF2G INFO Warning: adding TFs as their own target to adjecencies matrix. Importance values will be max + 1e-05 2022-11-08 16:48:42,057 TF2G INFO Adding importance x rho scores to adjacencies. 2022-11-08 16:48:42,076 TF2G INFO Took 16.456542253494263 seconds
# save
with open(outDir + 'zf_highToSomite_scplusObject_withTfToGene.pkl', 'wb') as f:
pickle.dump(scplus_obj, f)
infile = open(outDir + 'zf_highToSomite_scplusObject_withTfToGene.pkl', 'rb')
scplus_obj = pickle.load(infile)
infile.close()
# build eGRNs (it takes 10min)
build_grn(scplus_obj,
min_target_genes = 10,
adj_pval_thr = 1,
min_regions_per_gene = 0,
quantiles = (0.85, 0.90, 0.95),
top_n_regionTogenes_per_gene = (5, 10, 15),
top_n_regionTogenes_per_region = (),
binarize_using_basc = True,
rho_dichotomize_tf2g = True,
rho_dichotomize_r2g = True,
rho_dichotomize_eregulon = True,
rho_threshold = 0.05,
keep_extended_motif_annot = True,
merge_eRegulons = True,
order_regions_to_genes_by = 'importance',
order_TFs_to_genes_by = 'importance',
key_added = 'eRegulons_importance',
cistromes_key = 'Unfiltered',
disable_tqdm = True,
ray_n_cpu = 20,
_temp_dir = tmpDir)
2022-11-08 17:15:29,801 GSEA INFO Thresholding region to gene relationships
2022-11-08 17:15:50,692 INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
2022-11-08 17:23:24,614 GSEA INFO Subsetting TF2G adjacencies for TF with motif.
2022-11-08 17:23:40,973 INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
2022-11-08 17:23:42,899 GSEA INFO Running GSEA...
(_ray_run_gsea_for_e_module pid=226126) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide (_ray_run_gsea_for_e_module pid=226126) norm_tag = 1.0/sum_correl_tag (_ray_run_gsea_for_e_module pid=226126) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply (_ray_run_gsea_for_e_module pid=226126) RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis) (_ray_run_gsea_for_e_module pid=226130) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide (_ray_run_gsea_for_e_module pid=226130) norm_tag = 1.0/sum_correl_tag (_ray_run_gsea_for_e_module pid=226130) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply (_ray_run_gsea_for_e_module pid=226130) RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis) (_ray_run_gsea_for_e_module pid=226131) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide (_ray_run_gsea_for_e_module pid=226131) norm_tag = 1.0/sum_correl_tag (_ray_run_gsea_for_e_module pid=226131) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply (_ray_run_gsea_for_e_module pid=226131) RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis) (_ray_run_gsea_for_e_module pid=226113) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide (_ray_run_gsea_for_e_module pid=226113) norm_tag = 1.0/sum_correl_tag (_ray_run_gsea_for_e_module pid=226113) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply (_ray_run_gsea_for_e_module pid=226113) RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis) (_ray_run_gsea_for_e_module pid=226124) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide (_ray_run_gsea_for_e_module pid=226124) norm_tag = 1.0/sum_correl_tag (_ray_run_gsea_for_e_module pid=226124) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply (_ray_run_gsea_for_e_module pid=226124) RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis) (_ray_run_gsea_for_e_module pid=226125) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide (_ray_run_gsea_for_e_module pid=226125) norm_tag = 1.0/sum_correl_tag (_ray_run_gsea_for_e_module pid=226125) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply (_ray_run_gsea_for_e_module pid=226125) RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis) (_ray_run_gsea_for_e_module pid=226128) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:72: RuntimeWarning: divide by zero encountered in divide (_ray_run_gsea_for_e_module pid=226128) norm_no_tag = 1.0/Nmiss (_ray_run_gsea_for_e_module pid=226128) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply (_ray_run_gsea_for_e_module pid=226128) RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
2022-11-08 17:25:36,582 GSEA INFO Subsetting on adjusted pvalue: 1, minimal NES: 0 and minimal leading edge genes 10 2022-11-08 17:25:37,671 GSEA INFO Merging eRegulons 2022-11-08 17:25:38,053 GSEA INFO Storing eRegulons in .uns[eRegulons_importance].
# save
with open(outDir + 'zf_highToSomite_scplusObject_withGRN.pkl', 'wb') as f:
dill.dump(scplus_obj, f)
use this tutorial: https://scenicplus.readthedocs.io/en/latest/Scenicplus_step_by_step-RTD.html#
infile = open(outDir + 'zf_highToSomite_scplusObject_withGRN.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
format_egrns(scplus_obj, eregulons_key = 'eRegulons_importance', TF2G_key = 'TF2G_adj', key_added = 'eRegulon_metadata')
scplus_obj.uns['eRegulon_metadata'][0:10]
| Region_signature_name | Gene_signature_name | TF | is_extended | Region | Gene | R2G_importance | R2G_rho | R2G_importance_x_rho | R2G_importance_x_abs_rho | TF2G_importance | TF2G_regulation | TF2G_rho | TF2G_importance_x_abs_rho | TF2G_importance_x_rho | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr14:14644555-14645055 | znf185 | 0.061208 | 0.295798 | 0.018105 | 0.018105 | 13.435855 | 1 | 0.425698 | 5.719616 | 5.719616 |
| 1 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr14:14520133-14520633 | znf185 | 0.017040 | 0.249921 | 0.004259 | 0.004259 | 13.435855 | 1 | 0.425698 | 5.719616 | 5.719616 |
| 2 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr14:14568116-14568616 | znf185 | 0.042576 | 0.286562 | 0.012201 | 0.012201 | 13.435855 | 1 | 0.425698 | 5.719616 | 5.719616 |
| 3 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr14:14642999-14643499 | znf185 | 0.086022 | 0.311011 | 0.026754 | 0.026754 | 13.435855 | 1 | 0.425698 | 5.719616 | 5.719616 |
| 4 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr14:14642242-14642742 | znf185 | 0.063553 | 0.396462 | 0.025196 | 0.025196 | 13.435855 | 1 | 0.425698 | 5.719616 | 5.719616 |
| 5 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr5:30897269-30897769 | spns3 | 0.042442 | 0.449666 | 0.019085 | 0.019085 | 15.414667 | 1 | 0.388184 | 5.983722 | 5.983722 |
| 6 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr5:31034714-31035214 | spns3 | 0.020183 | 0.196341 | 0.003963 | 0.003963 | 15.414667 | 1 | 0.388184 | 5.983722 | 5.983722 |
| 7 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr5:30992062-30992562 | spns3 | 0.022284 | 0.281541 | 0.006274 | 0.006274 | 15.414667 | 1 | 0.388184 | 5.983722 | 5.983722 |
| 8 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr5:31092057-31092557 | spns3 | 0.034984 | 0.339177 | 0.011866 | 0.011866 | 15.414667 | 1 | 0.388184 | 5.983722 | 5.983722 |
| 9 | TCF4_+_+_(2024r) | TCF4_+_+_(734g) | TCF4 | False | chr5:30965755-30966255 | spns3 | 0.071577 | 0.304456 | 0.021792 | 0.021792 | 15.414667 | 1 | 0.388184 | 5.983722 | 5.983722 |
scplus_obj.uns['eRegulon_metadata'].to_csv(outDir + 'scenicplus/eRegulon_metadata.csv',sep='\t',index=False)
infile = open(outDir + 'zf_highToSomite_scplusObject_withGRN.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
# output Cistromes
df = pd.DataFrame(columns = ['regulon','Chromosome','Start','End'])
for key in scplus_obj.uns['Cistromes']['Unfiltered']:
df2 = scplus_obj.uns['Cistromes']['Unfiltered'][key]
df2 = df2.df # change PyRanges as DataFrame
df3 = pd.DataFrame({'regulon': np.repeat(key,df2.shape[0])}) # generate one data frame
df4 = pd.concat([df3, df2], axis=1) # combine two data frames
df = df.append(df4)
df.to_csv("output/scenicplus/cistromes.txt",sep ='\t',index=False,header=True)
for key in scplus_obj.uns['Cistromes']['Unfiltered']:
print (key)
AL807792.3_(13225r) arnt_(118r) arnt2_(6793r) arntl1a_(623r) arntl1b_(623r) atf1_(9657r) atf2_(5740r) atf3_(2982r) atf4a_(12592r) atf4b_(12592r) atf6_(20590r) atf7a_(4588r) atf7b_(4588r) atoh1a_(13225r) atoh1b_(13225r) atoh1c_(13225r) barhl1a_(66r) barhl1b_(66r) batf2_(2979r) batf3_(2979r) bhlha15_(23643r) bhlhe22_(19794r) bhlhe40_(92r) bhlhe41_(2456r) bloc1s2_(13225r) bsx_(32r) BX548005.1_(3475r) CABZ01066694.1_(1487r) CABZ01079241.1_(2325r) CABZ01090890.1_(20590r) cdx1a_(7r) cdx1b_(7r) cebp1_(227r) cebpb_(4621r) cebpd_(4507r) clocka_(40r) clockb_(40r) CR354582.1_(405r) creb1a_(14230r) creb1b_(14230r) creb3l1_(73r) creb3l2_(228r) creb5a_(17216r) creb5b_(17216r) crema_(14230r) cremb_(14230r) crx_(614r) CT027745.1_(18540r) CT573494.3_(4802r) ctcf_(15916r) cxxc1a_(473r) cxxc1b_(473r) ddit3_(279r) dharma_(3830r) dlx5a_(21r) dmbx1a_(529r) dmbx1b_(529r) drgx_(496r) e2f1_(4744r) e2f2_(20585r) e2f3_(1042r) e2f4_(57592r) e2f5_(8878r) e2f6_(2138r) e2f7_(72025r) e2f8_(5r) e4f1_(7335r) ebf3a-1_(12763r) ebf3a-2_(1227r) ebf3b_(1227r) egr1_(738r) egr2a_(7766r) egr2b_(7766r) egr3_(1400r) ehf_(218r) elf1_(3573r) elf2a_(2069r) elf2b_(2069r) elf3_(183r) elk1_(3240r) elk3_(888r) elk4_(3189r) emx1_(330r) en1b_(193r) en2a_(1770r) en2b_(1770r) eomesa_(1710r) eomesb_(1710r) erf_(1844r) erfl3_(1844r) erg_(1158r) esrrb_(14806r) ets2_(1035r) etv1_(3141r) etv2_(1086r) etv4_(103r) etv5a_(4254r) etv5b_(4254r) etv7_(1043r) evx2_(3429r) fev_(27688r) figla_(1127r) fli1a_(27688r) fli1b_(256r) fosaa_(4630r) fosab_(4630r) fosl1a_(4630r) fosl1b_(7484r) foxa_(1993r) foxa1_(2248r) foxa2_(1212r) foxd1_(189r) foxd3_(884r) foxd5_(8r) foxd7_(189r) foxg1a_(10915r) foxg1b_(645r) foxg1c_(10915r) foxg1d_(10915r) foxn1_(31761r) foxo1a_(875r) foxo1b_(875r) foxo3b_(46855r) foxo6a_(46855r) foxo6b_(46855r) foxp2_(613r) foxq2_(77r) gabpa_(41829r) gata1a_(365r) gata1b_(24739r) gata2a_(24739r) gata2b_(365r) gata3_(131r) gata4_(5577r) gata5_(448r) gata6_(2450r) gatad2ab_(2498r) gbx1_(270r) gcm2_(37r) gli1_(787r) gli2a_(177r) gli2b_(787r) gli3_(177r) glis1a_(1763r) glis1b_(1763r) glis2a_(1961r) glis2b_(1961r) glis3_(3425r) gmeb1_(15r) gmeb2_(22r) grhl1_(32767r) gsc_(3471r) gsx1_(17r) her1_(59r) her11_(59r) her12_(51r) her15.1_(51r) her15.2_(51r) her2_(51r) her4.1_(51r) her4.2_(51r) her4.3_(51r) her4.4_(51r) her5_(59r) her6_(11807r) her8.2_(11807r) her9_(11807r) hey1_(90794r) hey2_(54r) heyl_(90794r) hic1_(71114r) hinfp_(59436r) hmx4_(140r) hnf1a_(50r) hnf1ba_(600r) hnf1bb_(600r) hnf4a_(4278r) hnf4b_(4278r) hnf4g_(1529r) homeza_(8r) homezb_(8r) hoxa11a_(309r) hoxa13a_(72r) hoxa13b_(72r) hoxa1a_(367r) hoxa2b_(840r) hoxa4a_(230r) hoxa9a_(1829r) hoxa9b_(1829r) hoxb1a_(849r) hoxb1b-1_(367r) hoxb1b-2_(849r) hoxb3a_(68r) hoxb4a_(1982r) hoxb6a_(1201r) hoxb6b_(1201r) hoxb8a_(781r) hoxb8b_(781r) hoxc12a_(3704r) hoxc12b_(3704r) hoxc13a_(669r) hoxc1a_(367r) hoxc3a_(230r) hoxc9a_(1783r) hoxd12a_(2821r) hsf1_(341r) hsf4_(341r) id4_(1024r) ikzf1_(76r) insm1a_(2025r) insm1b_(2025r) irf10_(25526r) irf3_(25526r) irf5_(856r) irf8_(25526r) irf9_(25526r) jdp2a_(7484r) jdp2b_(4823r) jun_(2498r) junba_(3488r) junbb_(3488r) june_(34394r) klf12a_(51519r) klf13_(2745r) klf15_(46788r) klf3_(717r) klf4_(13371r) klf5l_(7281r) klf6a_(7281r) klf6b_(7281r) klf7a_(449r) klf7b_(449r) klf8_(717r) lef1_(33r) lhx6_(32r) lhx6-1_(90r) lhx8a_(7620r) lhx8b_(7620r) mafa_(2327r) mafb_(239r) mafbb_(2327r) maff_(58r) mafga_(58r) mafgb_(58r) max_(2390r) maza_(54030r) mazb_(54030r) mbd1a_(473r) mbd1b_(473r) mbd2_(48347r) mecom_(123r) mecp2_(23406r) mef2aa_(13r) mef2ab_(13r) mef2b_(44011r) mef2d_(13r) meis3_(1224r) meox1_(154r) meox2a_(30r) mespaa_(842r) mespab_(842r) mespba_(842r) mespbb_(842r) mgaa_(4160r) mlx_(56123r) mlxipl_(392r) mnta_(46r) mntb_(46r) msc_(6406r) msx1a_(32r) msx1b_(16r) msx2b_(55r) msx3_(16r) mta3_(7392r) mtf1_(34911r) mxi1_(872r) myb_(128r) myca_(838r) mycb_(6095r) mych_(838r) mycn_(909r) myf6_(2771r) myod1_(2422r) myog_(7268r) nanog_(32r) neurod1_(7955r) neurod2_(543r) nfatc2a_(33r) nfe2l3_(2183r) nfia_(13r) nfkb1_(206r) nfya_(4123r) nfyal_(4713r) nkx1.2lb_(965r) nkx2.1_(18r) npas2_(40r) nr1h4_(195r) nr1i2_(51r) nr2c1_(3254r) nr2c2_(15000r) nr2f1a_(3062r) nr2f1b_(177r) nr2f2_(4755r) nr2f5_(177r) nr2f6a_(4449r) nr2f6b_(4449r) nr4a3_(2516r) nr6a1b_(9206r) nrl_(239r) olig1_(11310r) onecut1_(11608r) onecutl_(11608r) otx1_(6329r) otx2a_(6842r) otx2b_(1310r) otx5_(614r) patz1_(68153r) pax1a_(428r) pax1b_(428r) pax2a_(395r) pax2b_(71r) pax5_(395r) pax8_(6r) pax9_(688r) pbx1a_(34r) pbx1b_(34r) pbx2_(34r) pbx3a_(34r) pbx3b_(34r) pdx1_(1502r) pgm3_(8r) phox2ba_(39r) pitx1_(19r) pitx2_(508r) pitx3_(19r) pknox2_(2214r) plag1_(2115r) plagx_(2115r) pou2f2a_(8394r) pou2f3_(440r) pou3f1_(753r) pou4f3_(15r) pparaa_(2903r) pparda_(2903r) ppardb_(2903r) prdm14_(34r) prdm2a_(24571r) prdm4_(569r) prox1a_(22297r) prox2_(22297r) prox3_(22297r) prrx1a_(695r) prrx1b_(695r) raraa_(29r) rarab_(29r) rarga_(179r) rargb_(74r) rcor1_(1142r) rest_(3189r) rfx1a_(1216r) rfx1b_(292r) rfx2_(1444r) rfx3_(6463r) rfx4_(1444r) rfx7a_(86r) rfx7b_(86r) rlf_(123r) rreb1a_(13284r) rreb1b_(13284r) rsl1d1_(6559r) runx1_(3r) runx3_(3226r) rxraa_(4619r) rxrab_(4619r) rxrba_(4619r) rxrbb_(4114r) rxrgb_(4619r) scrt1a_(956r) scrt1b_(956r) scrt2_(956r) si:ch211-117k10.3_(46788r) si:ch211-153j24.3_(2598r) si:ch211-51e8.2_(12763r) si:ch211-69l10.4_(17177r) si:ch73-59f11.3_(2390r) si:dkey-149i17.7_(3549r) si:dkey-229b18.3_(302r) si:dkey-56e3.3_(26r) si:dkeyp-79b7.12_(7335r) si:rp71-45k5.2_(645r) sinhcafl_(123r) six1a_(213r) six2a_(213r) six2b_(213r) six3a_(676r) six3b_(676r) six4a_(13r) six4b_(13r) six5_(213r) six7_(676r) six9_(213r) smad1_(81r) smad10a_(19r) smad2_(4r) smad3a_(9r) smad3b_(9r) smad4a_(19r) smarcc1a_(2r) smarcc1b_(2r) smarcc2_(3475r) snai1a_(2769r) snai1b_(1435r) snai2_(302r) snai3_(1435r) sox10_(221r) sox12_(19r) sox19a_(812r) sox19b_(812r) sox2_(795r) sox3_(812r) sox6_(258r) sox8a_(48r) sox8b_(48r) sox9a_(2967r) sox9b_(2967r) sp1_(46071r) sp2_(6197r) sp3a_(6559r) sp3b_(51613r) sp4_(7887r) sp8a_(4847r) sp8b_(4847r) SPDEF_(19958r) spi1a_(56r) spi1b_(56r) srebf1_(27928r) srebf2_(178r) srfa_(58r) srfb_(58r) stat5a_(5314r) stat5b_(5314r) stat6_(14034r) tal1_(2529r) tbr1a_(6665r) tbr1b_(6665r) tbx1_(16675r) tbx15_(17177r) tbx16_(4200r) tbx16l_(4200r) tbx19_(18540r) tbx20_(1375r) tbx21_(6665r) tbx2a_(4200r) tbx2b_(4200r) tbx4_(4112r) tbx5a_(2319r) tbx5b_(2319r) tbxta_(15246r) tbxtb_(15246r) tcf12_(14837r) tcf21_(22386r) tcf3a_(12259r) tcf3b_(12259r) TCF4_(16691r) tcf7_(568r) tcf7l1a_(1838r) tcf7l1b_(1838r) tcf7l2_(1838r) tead1a_(4802r) tead1b_(4422r) tead3a_(10088r) tead3b_(10088r) tefa_(4047r) tefb_(4047r) tfap2a_(26430r) tfap2b_(2871r) tfap2c_(26430r) tfap2d_(61543r) tfap2e_(31698r) tfap4_(10094r) tfcp2_(4835r) tfcp2l1_(4835r) tfdp1a_(66653r) tfdp1b_(66653r) tfeb_(1725r) tlx1_(2930r) tp53_(4522r) tp73_(445r) ttf1.4_(2771r) twist1a_(52r) twist1b_(52r) twist3_(52r) ubp1_(4835r) usf1_(216r) usf1l_(19340r) usf2_(2064r) VDR_(51r) vdrb_(298r) ved_(7r) wt1a_(19807r) wt1b_(19807r) xbp1_(21r) ybx1_(565r) yy1a_(115r) yy1b_(115r) zbtb14_(66514r) zbtb18_(12835r) zbtb3_(24r) zbtb42_(12835r) zbtb47b_(20798r) zbtb49_(31r) zbtb7a_(56r) zbtb7b_(5948r) zbtb7c_(56r) zeb1a_(349r) zeb1b_(349r) zfx_(2325r) zgc:112083_(21136r) zgc:113424_(10915r) zgc:153115_(2745r) zic1_(3003r) zic2a_(4655r) zic2b_(4655r) zic3_(241r) zic4_(32032r) zic6_(32032r) znf143b_(29475r) znf148_(39387r) znf423_(5715r) znf598_(13284r) znf652_(20798r) znf711_(2325r) znf740a_(34273r) znf740b_(34273r) AL807792.3_extended_(13225r) arnt2_extended_(6793r) arnt_extended_(118r) arntl1a_extended_(623r) arntl1b_extended_(623r) atf1_extended_(9657r) atf2_extended_(5740r) atf3_extended_(2982r) atf4a_extended_(12592r) atf4b_extended_(12592r) atf6_extended_(20590r) atf7a_extended_(4588r) atf7b_extended_(4588r) atoh1a_extended_(13225r) atoh1b_extended_(13225r) atoh1c_extended_(13225r) barhl1a_extended_(66r) barhl1b_extended_(66r) batf2_extended_(2979r) batf3_extended_(2979r) bhlha15_extended_(23643r) bhlhe22_extended_(19794r) bhlhe40_extended_(92r) bhlhe41_extended_(2456r) bloc1s2_extended_(13225r) bsx_extended_(32r) BX548005.1_extended_(3475r) CABZ01066694.1_extended_(1487r) CABZ01079241.1_extended_(2325r) CABZ01090890.1_extended_(20590r) cdx1a_extended_(7r) cdx1b_extended_(7r) cebp1_extended_(227r) cebpb_extended_(4621r) cebpd_extended_(4507r) clocka_extended_(40r) clockb_extended_(40r) CR354582.1_extended_(405r) creb1a_extended_(14230r) creb1b_extended_(14230r) creb3l1_extended_(73r) creb3l2_extended_(228r) creb5a_extended_(17216r) creb5b_extended_(17216r) crema_extended_(14230r) cremb_extended_(14230r) crx_extended_(614r) CT027745.1_extended_(18540r) CT573494.3_extended_(4802r) ctcf_extended_(15916r) cxxc1a_extended_(473r) cxxc1b_extended_(473r) ddit3_extended_(279r) dharma_extended_(3830r) dlx5a_extended_(21r) dmbx1a_extended_(529r) dmbx1b_extended_(529r) drgx_extended_(496r) e2f1_extended_(4744r) e2f2_extended_(20585r) e2f3_extended_(1042r) e2f4_extended_(57592r) e2f5_extended_(8878r) e2f6_extended_(2138r) e2f7_extended_(72025r) e2f8_extended_(5r) e4f1_extended_(7335r) ebf3a-1_extended_(12763r) ebf3a-2_extended_(1227r) ebf3b_extended_(1227r) egr1_extended_(738r) egr2a_extended_(7766r) egr2b_extended_(7766r) egr3_extended_(1400r) ehf_extended_(218r) elf1_extended_(3573r) elf2a_extended_(2069r) elf2b_extended_(2069r) elf3_extended_(183r) elk1_extended_(3240r) elk3_extended_(888r) elk4_extended_(3189r) emx1_extended_(330r) en1b_extended_(193r) en2a_extended_(1770r) en2b_extended_(1770r) eomesa_extended_(1710r) eomesb_extended_(1710r) erf_extended_(1844r) erfl3_extended_(1844r) erg_extended_(1158r) esrrb_extended_(14806r) ets2_extended_(1035r) etv1_extended_(3141r) etv2_extended_(1086r) etv4_extended_(103r) etv5a_extended_(4254r) etv5b_extended_(4254r) etv7_extended_(1043r) evx2_extended_(3429r) fev_extended_(27688r) figla_extended_(1127r) fli1a_extended_(27688r) fli1b_extended_(256r) fosaa_extended_(4630r) fosab_extended_(4630r) fosl1a_extended_(4630r) fosl1b_extended_(7484r) foxa1_extended_(2248r) foxa2_extended_(1212r) foxa_extended_(1993r) foxd1_extended_(189r) foxd3_extended_(884r) foxd5_extended_(8r) foxd7_extended_(189r) foxg1a_extended_(10915r) foxg1b_extended_(645r) foxg1c_extended_(10915r) foxg1d_extended_(10915r) foxn1_extended_(31761r) foxo1a_extended_(875r) foxo1b_extended_(875r) foxo3b_extended_(46855r) foxo6a_extended_(46855r) foxo6b_extended_(46855r) foxp2_extended_(613r) foxq2_extended_(77r) gabpa_extended_(41829r) gata1a_extended_(365r) gata1b_extended_(24739r) gata2a_extended_(24739r) gata2b_extended_(365r) gata3_extended_(131r) gata4_extended_(5577r) gata5_extended_(448r) gata6_extended_(2450r) gatad2ab_extended_(2498r) gbx1_extended_(270r) gcm2_extended_(37r) gli1_extended_(787r) gli2a_extended_(177r) gli2b_extended_(787r) gli3_extended_(177r) glis1a_extended_(1763r) glis1b_extended_(1763r) glis2a_extended_(1961r) glis2b_extended_(1961r) glis3_extended_(3425r) gmeb1_extended_(15r) gmeb2_extended_(22r) grhl1_extended_(32767r) gsc_extended_(3471r) gsx1_extended_(17r) her11_extended_(59r) her12_extended_(51r) her15.1_extended_(51r) her15.2_extended_(51r) her1_extended_(59r) her2_extended_(51r) her4.1_extended_(51r) her4.2_extended_(51r) her4.3_extended_(51r) her4.4_extended_(51r) her5_extended_(59r) her6_extended_(11807r) her8.2_extended_(11807r) her9_extended_(11807r) hey1_extended_(90794r) hey2_extended_(54r) heyl_extended_(90794r) hic1_extended_(71114r) hinfp_extended_(59436r) hmx4_extended_(140r) hnf1a_extended_(50r) hnf1ba_extended_(600r) hnf1bb_extended_(600r) hnf4a_extended_(4278r) hnf4b_extended_(4278r) hnf4g_extended_(1529r) homeza_extended_(8r) homezb_extended_(8r) hoxa11a_extended_(309r) hoxa13a_extended_(72r) hoxa13b_extended_(72r) hoxa1a_extended_(367r) hoxa2b_extended_(840r) hoxa4a_extended_(230r) hoxa9a_extended_(1829r) hoxa9b_extended_(1829r) hoxb1a_extended_(849r) hoxb1b-1_extended_(367r) hoxb1b-2_extended_(849r) hoxb3a_extended_(68r) hoxb4a_extended_(1982r) hoxb6a_extended_(1201r) hoxb6b_extended_(1201r) hoxb8a_extended_(781r) hoxb8b_extended_(781r) hoxc12a_extended_(3704r) hoxc12b_extended_(3704r) hoxc13a_extended_(669r) hoxc1a_extended_(367r) hoxc3a_extended_(230r) hoxc9a_extended_(1783r) hoxd12a_extended_(2821r) hsf1_extended_(341r) hsf4_extended_(341r) id4_extended_(1024r) ikzf1_extended_(76r) insm1a_extended_(2025r) insm1b_extended_(2025r) irf10_extended_(25526r) irf3_extended_(25526r) irf5_extended_(856r) irf8_extended_(25526r) irf9_extended_(25526r) jdp2a_extended_(7484r) jdp2b_extended_(4823r) jun_extended_(2498r) junba_extended_(3488r) junbb_extended_(3488r) june_extended_(34394r) klf12a_extended_(51519r) klf13_extended_(2745r) klf15_extended_(46788r) klf3_extended_(717r) klf4_extended_(13371r) klf5l_extended_(7281r) klf6a_extended_(7281r) klf6b_extended_(7281r) klf7a_extended_(449r) klf7b_extended_(449r) klf8_extended_(717r) lef1_extended_(33r) lhx6-1_extended_(90r) lhx6_extended_(32r) lhx8a_extended_(7620r) lhx8b_extended_(7620r) mafa_extended_(2327r) mafb_extended_(239r) mafbb_extended_(2327r) maff_extended_(58r) mafga_extended_(58r) mafgb_extended_(58r) max_extended_(2390r) maza_extended_(54030r) mazb_extended_(54030r) mbd1a_extended_(473r) mbd1b_extended_(473r) mbd2_extended_(48347r) mecom_extended_(123r) mecp2_extended_(23406r) mef2aa_extended_(13r) mef2ab_extended_(13r) mef2b_extended_(44011r) mef2d_extended_(13r) meis3_extended_(1224r) meox1_extended_(154r) meox2a_extended_(30r) mespaa_extended_(842r) mespab_extended_(842r) mespba_extended_(842r) mespbb_extended_(842r) mgaa_extended_(4160r) mlx_extended_(56123r) mlxipl_extended_(392r) mnta_extended_(46r) mntb_extended_(46r) msc_extended_(6406r) msx1a_extended_(32r) msx1b_extended_(16r) msx2b_extended_(55r) msx3_extended_(16r) mta3_extended_(7392r) mtf1_extended_(34911r) mxi1_extended_(872r) myb_extended_(128r) myca_extended_(838r) mycb_extended_(6095r) mych_extended_(838r) mycn_extended_(909r) myf6_extended_(2771r) myod1_extended_(2422r) myog_extended_(7268r) nanog_extended_(32r) neurod1_extended_(7955r) neurod2_extended_(543r) nfatc2a_extended_(33r) nfe2l3_extended_(2183r) nfia_extended_(13r) nfkb1_extended_(206r) nfya_extended_(4123r) nfyal_extended_(4713r) nkx1.2lb_extended_(965r) nkx2.1_extended_(18r) npas2_extended_(40r) nr1h4_extended_(195r) nr1i2_extended_(51r) nr2c1_extended_(3254r) nr2c2_extended_(15000r) nr2f1a_extended_(3062r) nr2f1b_extended_(177r) nr2f2_extended_(4755r) nr2f5_extended_(177r) nr2f6a_extended_(4449r) nr2f6b_extended_(4449r) nr4a3_extended_(2516r) nr6a1b_extended_(9206r) nrl_extended_(239r) olig1_extended_(11310r) onecut1_extended_(11608r) onecutl_extended_(11608r) otx1_extended_(6329r) otx2a_extended_(6842r) otx2b_extended_(1310r) otx5_extended_(614r) patz1_extended_(68153r) pax1a_extended_(428r) pax1b_extended_(428r) pax2a_extended_(395r) pax2b_extended_(71r) pax5_extended_(395r) pax8_extended_(6r) pax9_extended_(688r) pbx1a_extended_(34r) pbx1b_extended_(34r) pbx2_extended_(34r) pbx3a_extended_(34r) pbx3b_extended_(34r) pdx1_extended_(1502r) pgm3_extended_(8r) phox2ba_extended_(39r) pitx1_extended_(19r) pitx2_extended_(508r) pitx3_extended_(19r) pknox2_extended_(2214r) plag1_extended_(2115r) plagx_extended_(2115r) pou2f2a_extended_(8394r) pou2f3_extended_(440r) pou3f1_extended_(753r) pou4f3_extended_(15r) pparaa_extended_(2903r) pparda_extended_(2903r) ppardb_extended_(2903r) prdm14_extended_(34r) prdm2a_extended_(24571r) prdm4_extended_(569r) prox1a_extended_(22297r) prox2_extended_(22297r) prox3_extended_(22297r) prrx1a_extended_(695r) prrx1b_extended_(695r) raraa_extended_(29r) rarab_extended_(29r) rarga_extended_(179r) rargb_extended_(74r) rcor1_extended_(1142r) rest_extended_(3189r) rfx1a_extended_(1216r) rfx1b_extended_(292r) rfx2_extended_(1444r) rfx3_extended_(6463r) rfx4_extended_(1444r) rfx7a_extended_(86r) rfx7b_extended_(86r) rlf_extended_(123r) rreb1a_extended_(13284r) rreb1b_extended_(13284r) rsl1d1_extended_(6559r) runx1_extended_(3r) runx3_extended_(3226r) rxraa_extended_(4619r) rxrab_extended_(4619r) rxrba_extended_(4619r) rxrbb_extended_(4114r) rxrgb_extended_(4619r) scrt1a_extended_(956r) scrt1b_extended_(956r) scrt2_extended_(956r) si:ch211-117k10.3_extended_(46788r) si:ch211-153j24.3_extended_(2598r) si:ch211-51e8.2_extended_(12763r) si:ch211-69l10.4_extended_(17177r) si:ch73-59f11.3_extended_(2390r) si:dkey-149i17.7_extended_(3549r) si:dkey-229b18.3_extended_(302r) si:dkey-56e3.3_extended_(26r) si:dkeyp-79b7.12_extended_(7335r) si:rp71-45k5.2_extended_(645r) sinhcafl_extended_(123r) six1a_extended_(213r) six2a_extended_(213r) six2b_extended_(213r) six3a_extended_(676r) six3b_extended_(676r) six4a_extended_(13r) six4b_extended_(13r) six5_extended_(213r) six7_extended_(676r) six9_extended_(213r) smad10a_extended_(19r) smad1_extended_(81r) smad2_extended_(4r) smad3a_extended_(9r) smad3b_extended_(9r) smad4a_extended_(19r) smarcc1a_extended_(2r) smarcc1b_extended_(2r) smarcc2_extended_(3475r) snai1a_extended_(2769r) snai1b_extended_(1435r) snai2_extended_(302r) snai3_extended_(1435r) sox10_extended_(221r) sox12_extended_(19r) sox19a_extended_(812r) sox19b_extended_(812r) sox2_extended_(795r) sox3_extended_(812r) sox6_extended_(258r) sox8a_extended_(48r) sox8b_extended_(48r) sox9a_extended_(2967r) sox9b_extended_(2967r) sp1_extended_(46071r) sp2_extended_(6197r) sp3a_extended_(6559r) sp3b_extended_(51613r) sp4_extended_(7887r) sp8a_extended_(4847r) sp8b_extended_(4847r) SPDEF_extended_(19958r) spi1a_extended_(56r) spi1b_extended_(56r) srebf1_extended_(27928r) srebf2_extended_(178r) srfa_extended_(58r) srfb_extended_(58r) stat5a_extended_(5314r) stat5b_extended_(5314r) stat6_extended_(14034r) tal1_extended_(2529r) tbr1a_extended_(6665r) tbr1b_extended_(6665r) tbx15_extended_(17177r) tbx16_extended_(4200r) tbx16l_extended_(4200r) tbx19_extended_(18540r) tbx1_extended_(16675r) tbx20_extended_(1375r) tbx21_extended_(6665r) tbx2a_extended_(4200r) tbx2b_extended_(4200r) tbx4_extended_(4112r) tbx5a_extended_(2319r) tbx5b_extended_(2319r) tbxta_extended_(15246r) tbxtb_extended_(15246r) tcf12_extended_(14837r) tcf21_extended_(22386r) tcf3a_extended_(12259r) tcf3b_extended_(12259r) TCF4_extended_(16691r) tcf7_extended_(568r) tcf7l1a_extended_(1838r) tcf7l1b_extended_(1838r) tcf7l2_extended_(1838r) tead1a_extended_(4802r) tead1b_extended_(4422r) tead3a_extended_(10088r) tead3b_extended_(10088r) tefa_extended_(4047r) tefb_extended_(4047r) tfap2a_extended_(26430r) tfap2b_extended_(2871r) tfap2c_extended_(26430r) tfap2d_extended_(61543r) tfap2e_extended_(31698r) tfap4_extended_(10094r) tfcp2_extended_(4835r) tfcp2l1_extended_(4835r) tfdp1a_extended_(66653r) tfdp1b_extended_(66653r) tfeb_extended_(1725r) tlx1_extended_(2930r) tp53_extended_(4522r) tp73_extended_(445r) ttf1.4_extended_(2771r) twist1a_extended_(52r) twist1b_extended_(52r) twist3_extended_(52r) ubp1_extended_(4835r) usf1_extended_(216r) usf1l_extended_(19340r) usf2_extended_(2064r) VDR_extended_(51r) vdrb_extended_(298r) ved_extended_(7r) wt1a_extended_(19807r) wt1b_extended_(19807r) xbp1_extended_(21r) ybx1_extended_(565r) yy1a_extended_(115r) yy1b_extended_(115r) zbtb14_extended_(66514r) zbtb18_extended_(12835r) zbtb3_extended_(24r) zbtb42_extended_(12835r) zbtb47b_extended_(20798r) zbtb49_extended_(31r) zbtb7a_extended_(56r) zbtb7b_extended_(5948r) zbtb7c_extended_(56r) zeb1a_extended_(349r) zeb1b_extended_(349r) zfx_extended_(2325r) zgc:112083_extended_(21136r) zgc:113424_extended_(10915r) zgc:153115_extended_(2745r) zic1_extended_(3003r) zic2a_extended_(4655r) zic2b_extended_(4655r) zic3_extended_(241r) zic4_extended_(32032r) zic6_extended_(32032r) znf143b_extended_(29475r) znf148_extended_(39387r) znf423_extended_(5715r) znf598_extended_(13284r) znf652_extended_(20798r) znf711_extended_(2325r) znf740a_extended_(34273r) znf740b_extended_(34273r)
# output region_to_gene
scplus_obj.uns['region_to_gene'].to_csv("output/scenicplus/region_to_gene.txt",sep ='\t',index=False,header=True)
# output TF2G_adj
scplus_obj.uns['TF2G_adj'].to_csv("output/scenicplus/TF2G_adj.txt",sep ='\t',index=False,header=True)
# output search_space
scplus_obj.uns['search_space'].to_csv("output/scenicplus/search_space.txt",sep ='\t',index=False,header=True)
# format eRegulons
get_eRegulons_as_signatures(scplus_obj, eRegulon_metadata_key='eRegulon_metadata', key_added='eRegulon_signatures')
# score chromatin layer
import time
start_time = time.time()
region_ranking = make_rankings(scplus_obj, target='region')
score_eRegulons(scplus_obj,
ranking = region_ranking,
eRegulon_signatures_key = 'eRegulon_signatures',
key_added = 'eRegulon_AUC',
enrichment_type= 'region',
auc_threshold = 0.05,
normalize = False,
n_cpu = 1) # if we use more than 1 cpu, there is no enough memory even we ask 270g
time = time.time()-start_time
print(time/60)
100%|██████████| 486/486 [02:51<00:00, 2.83it/s]
43.07801492214203
# score transcriptome layer
import time
start_time = time.time()
gene_ranking = make_rankings(scplus_obj, target='gene')
score_eRegulons(scplus_obj,
ranking = gene_ranking,
eRegulon_signatures_key = 'eRegulon_signatures',
key_added = 'eRegulon_AUC',
enrichment_type= 'gene',
auc_threshold = 0.05,
normalize = False,
n_cpu = 1)
time = time.time()-start_time
print(time/60)
100%|██████████| 486/486 [01:29<00:00, 5.42it/s]
2.0606292963027952
with open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withAUC.pkl', 'wb') as f:
dill.dump(scplus_obj, f)
scplus_obj.uns.keys()
dict_keys(['Cistromes', 'search_space', 'region_to_gene', 'TF2G_adj', 'eRegulons_importance', 'eRegulon_metadata', 'eRegulon_signatures', 'eRegulon_AUC', 'Pseudobulk', 'TF_cistrome_correlation', 'selected_eRegulons', 'eRegulon_AUC_thresholds', 'RSS'])
scplus_obj.uns['eRegulon_AUC']
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withAUC.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
# generate pseudobulks
import time
start_time = time.time()
generate_pseudobulks(scplus_obj,
variable = 'GEX_cell.type',
auc_key = 'eRegulon_AUC',
signature_key = 'Gene_based',
nr_cells = 5,
nr_pseudobulks = 100,
seed=555)
generate_pseudobulks(scplus_obj,
variable = 'GEX_cell.type',
auc_key = 'eRegulon_AUC',
signature_key = 'Region_based',
nr_cells = 5,
nr_pseudobulks = 100,
seed=555)
time = time.time()-start_time
print(time/60)
0.9698616782824199
# correlation between TF and eRegulons
import time
start_time = time.time()
TF_cistrome_correlation(scplus_obj,
variable = 'GEX_cell.type',
auc_key = 'eRegulon_AUC',
signature_key = 'Gene_based',
out_key = 'GEX_cell.type_eGRN_gene_based')
TF_cistrome_correlation(scplus_obj,
variable = 'GEX_cell.type',
auc_key = 'eRegulon_AUC',
signature_key = 'Region_based',
out_key = 'GEX_cell.type_eGRN__region_based')
time = time.time()-start_time
print(time/60)
0.019954482714335125
scplus_obj.uns['TF_cistrome_correlation']['GEX_cell.type_eGRN_gene_based'].head()
| TF | Cistrome | Rho | P-value | Adjusted_p-value | |
|---|---|---|---|---|---|
| 0 | TCF4 | TCF4_+_+_(734g) | 0.643763 | 0.000000e+00 | 0.000000e+00 |
| 1 | TCF4 | TCF4_+_-_(276g) | 0.637050 | 0.000000e+00 | 0.000000e+00 |
| 2 | TCF4 | TCF4_-_+_(14g) | -0.375055 | 4.712733e-315 | 5.289580e-315 |
| 3 | TCF4 | TCF4_extended_+_+_(734g) | 0.643763 | 0.000000e+00 | 0.000000e+00 |
| 4 | TCF4 | TCF4_extended_+_-_(276g) | 0.637050 | 0.000000e+00 | 0.000000e+00 |
We can plot eRegulon enrichment versus TF expression for each pseudobulk.
# Region based
%matplotlib inline
import seaborn as sns
sns.set_style("white")
colors = ["#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
"#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
"#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
"#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
"#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
"#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700",
"#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700"]
categories = sorted(set(scplus_obj.metadata_cell['GEX_cell.type']))
color_dict = dict(zip(categories, colors[0:len(categories)]))
len(colors)
95
dgem = scplus_obj.uns['Pseudobulk']['GEX_cell.type']['Expression'].copy()
dgem
| non-dorsal margin(dome)_0 | non-dorsal margin(dome)_1 | non-dorsal margin(dome)_2 | non-dorsal margin(dome)_3 | non-dorsal margin(dome)_4 | non-dorsal margin(dome)_5 | non-dorsal margin(dome)_6 | non-dorsal margin(dome)_7 | non-dorsal margin(dome)_8 | non-dorsal margin(dome)_9 | ... | dorsal posterior(50epi)_90 | dorsal posterior(50epi)_91 | dorsal posterior(50epi)_92 | dorsal posterior(50epi)_93 | dorsal posterior(50epi)_94 | dorsal posterior(50epi)_95 | dorsal posterior(50epi)_96 | dorsal posterior(50epi)_97 | dorsal posterior(50epi)_98 | dorsal posterior(50epi)_99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| lrmp | 1.144091 | 2.621850 | 3.716425 | 2.382706 | 0.000000 | 0.000000 | 1.194217 | 3.653447 | 1.121054 | 3.740444 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.248244 |
| h1m | 0.000000 | 0.000000 | 1.057777 | 0.000000 | 2.428852 | 0.000000 | 0.000000 | 1.148690 | 0.000000 | 0.000000 | ... | 1.180652 | 0.000000 | 0.000000 | 0.000000 | 1.144152 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.159197 |
| buc | 0.000000 | 0.000000 | 1.123443 | 0.000000 | 0.000000 | 1.260509 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| cldng | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| nanos3 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| si:dkey-37g12.1 | 1.135956 | 0.000000 | 2.235393 | 0.000000 | 2.396255 | 0.000000 | 0.000000 | 0.000000 | 0.983159 | 1.147330 | ... | 1.363163 | 1.351154 | 1.110953 | 0.000000 | 1.144152 | 0.000000 | 2.371025 | 0.000000 | 1.115729 | 1.121162 |
| polrmt | 0.000000 | 2.126545 | 0.000000 | 2.399753 | 0.000000 | 1.260509 | 0.000000 | 1.103503 | 3.620539 | 1.301960 | ... | 1.090825 | 0.000000 | 0.000000 | 2.367095 | 2.276762 | 2.577747 | 0.000000 | 0.000000 | 0.000000 | 1.127082 |
| prkcbp1l | 1.258668 | 3.303022 | 1.396969 | 1.066149 | 3.417448 | 0.000000 | 1.332591 | 3.599749 | 1.208858 | 1.576552 | ... | 1.180652 | 3.851826 | 4.047869 | 5.194694 | 3.288702 | 1.332825 | 3.599356 | 1.226213 | 2.785311 | 2.643868 |
| ap2a1 | 1.282392 | 2.100446 | 1.276825 | 3.490016 | 1.066149 | 2.567899 | 1.307390 | 3.376952 | 0.000000 | 0.000000 | ... | 2.389834 | 4.792479 | 1.110953 | 0.000000 | 3.427004 | 2.427370 | 3.824031 | 0.000000 | 1.115729 | 2.418685 |
| ap2m1a | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.315681 | 1.281786 | 1.194217 | 0.000000 | 0.000000 | 0.000000 | ... | 1.180652 | 1.212757 | 2.232278 | 2.405485 | 1.132610 | 1.246034 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
6054 rows × 9500 columns
pseudobulk_variable = 'GEX_cell.type'
signature_key = 'Region_based'
auc_key = 'eRegulon_AUC'
cistromes_auc = scplus_obj.uns['Pseudobulk'][pseudobulk_variable][auc_key][signature_key].copy()
cistromes_auc
| non-dorsal margin(dome)_0 | non-dorsal margin(dome)_1 | non-dorsal margin(dome)_2 | non-dorsal margin(dome)_3 | non-dorsal margin(dome)_4 | non-dorsal margin(dome)_5 | non-dorsal margin(dome)_6 | non-dorsal margin(dome)_7 | non-dorsal margin(dome)_8 | non-dorsal margin(dome)_9 | ... | dorsal posterior(50epi)_90 | dorsal posterior(50epi)_91 | dorsal posterior(50epi)_92 | dorsal posterior(50epi)_93 | dorsal posterior(50epi)_94 | dorsal posterior(50epi)_95 | dorsal posterior(50epi)_96 | dorsal posterior(50epi)_97 | dorsal posterior(50epi)_98 | dorsal posterior(50epi)_99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TCF4_+_+_(2024r) | 0.016235 | 0.014202 | 0.014010 | 0.014653 | 0.014376 | 0.013424 | 0.015316 | 0.013963 | 0.014099 | 0.013052 | ... | 0.016175 | 0.017039 | 0.016454 | 0.016426 | 0.017534 | 0.017502 | 0.017513 | 0.017611 | 0.017736 | 0.017642 |
| TCF4_+_-_(340r) | 0.168502 | 0.182706 | 0.180257 | 0.176234 | 0.194743 | 0.169320 | 0.194459 | 0.179392 | 0.175435 | 0.182556 | ... | 0.201456 | 0.206847 | 0.199728 | 0.212204 | 0.209357 | 0.199092 | 0.211413 | 0.218812 | 0.212306 | 0.204617 |
| TCF4_-_+_(34r) | 0.071814 | 0.076902 | 0.074792 | 0.078115 | 0.072811 | 0.075210 | 0.088982 | 0.070608 | 0.075925 | 0.071700 | ... | 0.075373 | 0.080140 | 0.068849 | 0.081392 | 0.088094 | 0.080956 | 0.085465 | 0.084446 | 0.093512 | 0.080089 |
| TCF4_extended_+_+_(2024r) | 0.016235 | 0.014202 | 0.014010 | 0.014653 | 0.014376 | 0.013424 | 0.015316 | 0.013963 | 0.014099 | 0.013052 | ... | 0.016175 | 0.017039 | 0.016454 | 0.016426 | 0.017534 | 0.017502 | 0.017513 | 0.017611 | 0.017736 | 0.017642 |
| TCF4_extended_+_-_(340r) | 0.168502 | 0.182706 | 0.180257 | 0.176234 | 0.194743 | 0.169320 | 0.194459 | 0.179392 | 0.175435 | 0.182556 | ... | 0.201456 | 0.206847 | 0.199728 | 0.212204 | 0.209357 | 0.199092 | 0.211413 | 0.218812 | 0.212306 | 0.204617 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| zic2b_extended_-_-_(58r) | 0.212589 | 0.251023 | 0.225907 | 0.222842 | 0.252322 | 0.221213 | 0.222670 | 0.251683 | 0.237691 | 0.250392 | ... | 0.231756 | 0.249851 | 0.228332 | 0.244389 | 0.238694 | 0.230526 | 0.235299 | 0.239331 | 0.231912 | 0.212523 |
| zic3_+_+_(17r) | 0.069496 | 0.080841 | 0.074363 | 0.067693 | 0.081880 | 0.067678 | 0.068008 | 0.091310 | 0.072010 | 0.099404 | ... | 0.077506 | 0.085768 | 0.092735 | 0.077251 | 0.082145 | 0.094189 | 0.089013 | 0.067954 | 0.086340 | 0.069998 |
| zic3_extended_+_+_(17r) | 0.069496 | 0.080841 | 0.074363 | 0.067693 | 0.081880 | 0.067678 | 0.068008 | 0.091310 | 0.072010 | 0.099404 | ... | 0.077506 | 0.085768 | 0.092735 | 0.077251 | 0.082145 | 0.094189 | 0.089013 | 0.067954 | 0.086340 | 0.069998 |
| znf598_+_-_(506r) | 0.227506 | 0.253403 | 0.237788 | 0.233916 | 0.257681 | 0.227425 | 0.246402 | 0.245454 | 0.237513 | 0.245152 | ... | 0.269148 | 0.287723 | 0.281602 | 0.285786 | 0.294475 | 0.284463 | 0.289237 | 0.276735 | 0.290449 | 0.280056 |
| znf598_extended_+_-_(506r) | 0.227506 | 0.253403 | 0.237788 | 0.233916 | 0.257681 | 0.227425 | 0.246402 | 0.245454 | 0.237513 | 0.245152 | ... | 0.269148 | 0.287723 | 0.281602 | 0.285786 | 0.294475 | 0.284463 | 0.289237 | 0.276735 | 0.290449 | 0.280056 |
486 rows × 9500 columns
cistromes_auc.index.values
array(['TCF4_+_+_(2024r)', 'TCF4_+_-_(340r)', 'TCF4_-_+_(34r)',
'TCF4_extended_+_+_(2024r)', 'TCF4_extended_+_-_(340r)',
'TCF4_extended_-_+_(34r)', 'atf3_+_+_(33r)',
'atf3_extended_+_+_(33r)', 'atf6_+_-_(1116r)',
'atf6_extended_+_-_(1116r)', 'atf7a_+_-_(36r)',
'atf7a_extended_+_-_(36r)', 'bhlha15_+_+_(79r)',
'bhlha15_+_-_(125r)', 'bhlha15_extended_+_+_(79r)',
'bhlha15_extended_+_-_(125r)', 'bhlhe41_+_+_(65r)',
'bhlhe41_+_-_(57r)', 'bhlhe41_extended_+_+_(65r)',
'bhlhe41_extended_+_-_(57r)', 'cebpb_+_+_(942r)',
'cebpb_-_-_(43r)', 'cebpb_extended_+_+_(942r)',
'cebpb_extended_-_-_(43r)', 'creb3l2_+_+_(10r)',
'creb3l2_extended_+_+_(10r)', 'ctcf_+_+_(502r)', 'ctcf_+_-_(143r)',
'ctcf_-_+_(29r)', 'ctcf_-_-_(32r)', 'ctcf_extended_+_+_(502r)',
'ctcf_extended_+_-_(143r)', 'ctcf_extended_-_+_(29r)',
'ctcf_extended_-_-_(32r)', 'dmbx1a_+_+_(31r)',
'dmbx1a_extended_+_+_(31r)', 'e2f3_+_+_(27r)', 'e2f3_+_-_(9r)',
'e2f3_extended_+_+_(27r)', 'e2f3_extended_+_-_(9r)',
'e2f5_+_+_(21r)', 'e2f5_extended_+_+_(21r)', 'e2f7_+_+_(679r)',
'e2f7_extended_+_+_(679r)', 'ebf3b_+_-_(10r)',
'ebf3b_extended_+_-_(10r)', 'egr2a_+_+_(88r)',
'egr2a_extended_+_+_(88r)', 'egr2b_+_+_(124r)',
'egr2b_extended_+_+_(124r)', 'elf1_+_+_(67r)', 'elf1_+_-_(207r)',
'elf1_-_+_(39r)', 'elf1_-_-_(20r)', 'elf1_extended_+_+_(67r)',
'elf1_extended_+_-_(207r)', 'elf1_extended_-_+_(39r)',
'elf1_extended_-_-_(20r)', 'elk3_+_-_(41r)',
'elk3_extended_+_-_(41r)', 'en2a_+_+_(22r)',
'en2a_extended_+_+_(22r)', 'erf_+_+_(78r)', 'erf_+_-_(81r)',
'erf_-_+_(14r)', 'erf_extended_+_+_(78r)',
'erf_extended_+_-_(81r)', 'erf_extended_-_+_(14r)',
'erfl3_+_+_(93r)', 'erfl3_+_-_(72r)', 'erfl3_extended_+_+_(93r)',
'erfl3_extended_+_-_(72r)', 'ets2_+_+_(17r)', 'ets2_+_-_(19r)',
'ets2_extended_+_+_(17r)', 'ets2_extended_+_-_(19r)',
'etv5a_+_+_(123r)', 'etv5a_+_-_(77r)', 'etv5a_extended_+_+_(123r)',
'etv5a_extended_+_-_(77r)', 'etv5b_+_+_(166r)', 'etv5b_+_-_(105r)',
'etv5b_extended_+_+_(166r)', 'etv5b_extended_+_-_(105r)',
'fli1a_+_+_(144r)', 'fli1a_+_-_(18r)', 'fli1a_extended_+_+_(144r)',
'fli1a_extended_+_-_(18r)', 'foxa2_+_+_(52r)',
'foxa2_extended_+_+_(52r)', 'foxa_+_+_(109r)', 'foxa_+_-_(10r)',
'foxa_extended_+_+_(109r)', 'foxa_extended_+_-_(10r)',
'foxd3_+_+_(25r)', 'foxd3_extended_+_+_(25r)', 'foxg1a_+_+_(75r)',
'foxg1a_+_-_(19r)', 'foxg1a_extended_+_+_(75r)',
'foxg1a_extended_+_-_(14r)', 'foxo1a_+_+_(30r)',
'foxo1a_extended_+_+_(30r)', 'foxp2_+_+_(25r)',
'foxp2_extended_+_+_(25r)', 'gata2a_+_+_(778r)',
'gata2a_-_+_(64r)', 'gata2a_-_-_(44r)',
'gata2a_extended_+_+_(778r)', 'gata2a_extended_-_+_(64r)',
'gata2a_extended_-_-_(44r)', 'gata4_+_+_(524r)', 'gata4_+_-_(48r)',
'gata4_extended_+_+_(524r)', 'gata4_extended_+_-_(48r)',
'gata5_+_+_(54r)', 'gata5_extended_+_+_(54r)', 'gata6_+_+_(381r)',
'gata6_+_-_(25r)', 'gata6_extended_+_+_(381r)',
'gata6_extended_+_-_(25r)', 'gli1_+_+_(80r)',
'gli1_extended_+_+_(80r)', 'gli2b_+_+_(55r)',
'gli2b_extended_+_+_(55r)', 'gli3_+_+_(22r)',
'gli3_extended_+_+_(22r)', 'glis1b_+_+_(25r)', 'glis1b_+_-_(14r)',
'glis1b_extended_+_+_(20r)', 'glis1b_extended_+_-_(14r)',
'grhl1_+_+_(7182r)', 'grhl1_+_-_(270r)', 'grhl1_-_+_(121r)',
'grhl1_-_-_(613r)', 'grhl1_extended_+_+_(7182r)',
'grhl1_extended_+_-_(270r)', 'grhl1_extended_-_+_(121r)',
'grhl1_extended_-_-_(613r)', 'gsc_+_+_(83r)', 'gsc_+_-_(24r)',
'gsc_extended_+_+_(83r)', 'gsc_extended_+_-_(28r)',
'her6_+_+_(521r)', 'her6_+_-_(623r)', 'her6_-_+_(20r)',
'her6_extended_+_+_(521r)', 'her6_extended_+_-_(623r)',
'her6_extended_-_+_(20r)', 'her9_+_+_(502r)', 'her9_+_-_(77r)',
'her9_extended_+_+_(502r)', 'her9_extended_+_-_(77r)',
'hey1_+_+_(1325r)', 'hey1_+_-_(157r)', 'hey1_extended_+_+_(1325r)',
'hey1_extended_+_-_(157r)', 'hic1_+_+_(612r)',
'hic1_extended_+_+_(612r)', 'hnf1ba_+_+_(74r)',
'hnf1ba_extended_+_+_(74r)', 'hnf4a_+_+_(756r)', 'hnf4a_+_-_(35r)',
'hnf4a_-_-_(18r)', 'hnf4a_extended_+_+_(756r)',
'hnf4a_extended_+_-_(35r)', 'hnf4a_extended_-_-_(18r)',
'hnf4b_+_+_(834r)', 'hnf4b_+_-_(45r)', 'hnf4b_extended_+_+_(834r)',
'hnf4b_extended_+_-_(45r)', 'hoxa9a_+_+_(86r)',
'hoxa9a_extended_+_+_(86r)', 'hoxc1a_+_+_(13r)',
'hoxc1a_extended_+_+_(13r)', 'hoxc9a_+_+_(55r)',
'hoxc9a_extended_+_+_(55r)', 'klf12a_+_+_(21r)',
'klf12a_extended_+_+_(21r)', 'klf13_+_+_(49r)', 'klf13_+_-_(26r)',
'klf13_extended_+_+_(49r)', 'klf13_extended_+_-_(26r)',
'klf3_+_-_(18r)', 'klf3_extended_+_-_(18r)', 'klf4_+_+_(1451r)',
'klf4_+_-_(306r)', 'klf4_extended_+_+_(1451r)',
'klf4_extended_+_-_(306r)', 'klf6a_+_+_(66r)', 'klf6a_+_-_(355r)',
'klf6a_-_+_(72r)', 'klf6a_-_-_(18r)', 'klf6a_extended_+_+_(66r)',
'klf6a_extended_+_-_(355r)', 'klf6a_extended_-_+_(72r)',
'klf6a_extended_-_-_(18r)', 'klf8_+_+_(54r)', 'klf8_+_-_(15r)',
'klf8_extended_+_+_(54r)', 'klf8_extended_+_-_(15r)',
'mafa_+_-_(19r)', 'mafa_extended_+_-_(19r)', 'meis3_+_+_(37r)',
'meis3_extended_+_+_(37r)', 'mespaa_+_+_(33r)',
'mespaa_extended_+_+_(33r)', 'mespab_+_+_(30r)',
'mespab_extended_+_+_(30r)', 'mlxipl_+_+_(19r)',
'mlxipl_extended_+_+_(19r)', 'mxi1_+_+_(21r)', 'mxi1_+_-_(56r)',
'mxi1_extended_+_+_(21r)', 'mxi1_extended_+_-_(56r)',
'mych_+_+_(11r)', 'mych_+_-_(12r)', 'mych_extended_+_+_(11r)',
'mych_extended_+_-_(12r)', 'mycn_+_+_(33r)',
'mycn_extended_+_+_(33r)', 'myod1_+_+_(114r)',
'myod1_extended_+_+_(114r)', 'neurod1_+_+_(17r)',
'neurod1_extended_+_+_(17r)', 'nfya_+_+_(97r)', 'nfya_+_-_(31r)',
'nfya_extended_+_+_(97r)', 'nfya_extended_+_-_(47r)',
'nr2f2_+_+_(70r)', 'nr2f2_extended_+_+_(70r)', 'nr2f6b_+_+_(258r)',
'nr2f6b_+_-_(76r)', 'nr2f6b_extended_+_+_(258r)',
'nr2f6b_extended_+_-_(76r)', 'nr6a1b_+_+_(75r)',
'nr6a1b_+_-_(31r)', 'nr6a1b_extended_+_+_(75r)',
'nr6a1b_extended_+_-_(31r)', 'onecut1_+_+_(3304r)',
'onecut1_+_-_(45r)', 'onecut1_-_-_(36r)',
'onecut1_extended_+_+_(3304r)', 'onecut1_extended_+_-_(45r)',
'onecut1_extended_-_-_(36r)', 'otx1_+_+_(361r)', 'otx1_+_-_(67r)',
'otx1_-_+_(24r)', 'otx1_extended_+_+_(361r)',
'otx1_extended_+_-_(67r)', 'otx1_extended_-_+_(24r)',
'otx2a_+_+_(301r)', 'otx2a_+_-_(77r)', 'otx2a_-_+_(29r)',
'otx2a_extended_+_+_(301r)', 'otx2a_extended_+_-_(77r)',
'otx2a_extended_-_+_(29r)', 'otx2b_+_+_(83r)',
'otx2b_extended_+_+_(83r)', 'pitx2_+_+_(31r)',
'pitx2_extended_+_+_(31r)', 'pknox2_+_+_(23r)',
'pknox2_extended_+_+_(23r)', 'pou2f2a_+_-_(70r)',
'pou2f2a_extended_+_-_(70r)', 'prox1a_+_+_(88r)',
'prox1a_extended_+_+_(88r)', 'rest_+_+_(11r)', 'rest_+_-_(67r)',
'rest_extended_+_+_(11r)', 'rest_extended_+_-_(67r)',
'rfx1a_+_+_(20r)', 'rfx1a_extended_+_+_(33r)', 'rfx2_+_+_(107r)',
'rfx2_+_-_(47r)', 'rfx2_extended_+_+_(107r)',
'rfx2_extended_+_-_(47r)', 'rfx3_+_+_(387r)',
'rfx3_extended_+_+_(387r)', 'rfx4_+_+_(173r)',
'rfx4_extended_+_+_(173r)', 'rreb1a_+_+_(648r)',
'rreb1a_+_-_(813r)', 'rreb1a_-_-_(14r)',
'rreb1a_extended_+_+_(648r)', 'rreb1a_extended_+_-_(813r)',
'rreb1a_extended_-_-_(14r)', 'rreb1b_+_+_(1219r)',
'rreb1b_-_+_(28r)', 'rreb1b_-_-_(20r)',
'rreb1b_extended_+_+_(1219r)', 'rreb1b_extended_-_+_(28r)',
'rreb1b_extended_-_-_(20r)', 'rxraa_+_+_(151r)', 'rxraa_-_+_(13r)',
'rxraa_extended_+_+_(151r)', 'rxraa_extended_-_+_(13r)',
'rxrgb_+_+_(375r)', 'rxrgb_+_-_(41r)', 'rxrgb_extended_+_+_(375r)',
'rxrgb_extended_+_-_(41r)', 'six3b_+_-_(17r)',
'six3b_extended_+_-_(17r)', 'six7_+_-_(13r)',
'six7_extended_+_-_(13r)', 'snai1a_+_+_(79r)', 'snai1a_+_-_(40r)',
'snai1a_extended_+_+_(79r)', 'snai1a_extended_+_-_(40r)',
'sox19a_+_+_(59r)', 'sox19a_extended_+_+_(59r)',
'sox19b_+_+_(58r)', 'sox19b_extended_+_+_(58r)', 'sox2_+_+_(56r)',
'sox2_extended_+_+_(56r)', 'sox3_+_+_(70r)',
'sox3_extended_+_+_(70r)', 'sox3_extended_+_-_(17r)',
'sox9a_+_+_(17r)', 'sox9a_extended_+_+_(17r)', 'sox9b_+_-_(14r)',
'sox9b_extended_+_-_(14r)', 'stat5a_+_-_(62r)',
'stat5a_extended_+_-_(62r)', 'tbx16_+_+_(363r)', 'tbx16_+_-_(39r)',
'tbx16_-_+_(27r)', 'tbx16_-_-_(72r)', 'tbx16_extended_+_+_(363r)',
'tbx16_extended_+_-_(39r)', 'tbx16_extended_-_+_(27r)',
'tbx16_extended_-_-_(72r)', 'tbx16l_+_+_(198r)',
'tbx16l_+_-_(20r)', 'tbx16l_-_+_(14r)', 'tbx16l_-_-_(15r)',
'tbx16l_extended_+_+_(198r)', 'tbx16l_extended_+_-_(21r)',
'tbx16l_extended_-_+_(14r)', 'tbx16l_extended_-_-_(15r)',
'tbx19_+_+_(55r)', 'tbx19_+_-_(12r)', 'tbx19_extended_+_+_(55r)',
'tbx19_extended_+_-_(12r)', 'tbx1_+_+_(503r)', 'tbx1_+_-_(115r)',
'tbx1_extended_+_+_(503r)', 'tbx1_extended_+_-_(115r)',
'tbx20_+_+_(11r)', 'tbx20_extended_+_+_(11r)', 'tbx2a_+_-_(21r)',
'tbx2a_extended_+_-_(21r)', 'tbx2b_+_+_(33r)', 'tbx2b_+_-_(63r)',
'tbx2b_extended_+_+_(33r)', 'tbx2b_extended_+_-_(63r)',
'tbx5a_+_-_(19r)', 'tbx5a_extended_+_-_(19r)', 'tbxta_+_+_(1129r)',
'tbxta_+_-_(115r)', 'tbxta_-_+_(113r)', 'tbxta_-_-_(75r)',
'tbxta_extended_+_+_(1129r)', 'tbxta_extended_+_-_(115r)',
'tbxta_extended_-_+_(113r)', 'tbxta_extended_-_-_(75r)',
'tbxtb_+_+_(870r)', 'tbxtb_-_-_(19r)', 'tbxtb_extended_+_+_(870r)',
'tbxtb_extended_-_-_(19r)', 'tcf12_+_+_(1197r)', 'tcf12_-_+_(37r)',
'tcf12_-_-_(17r)', 'tcf12_extended_+_+_(1197r)',
'tcf12_extended_-_+_(37r)', 'tcf12_extended_-_-_(17r)',
'tcf3a_+_+_(335r)', 'tcf3a_+_-_(432r)', 'tcf3a_-_-_(12r)',
'tcf3a_extended_+_+_(335r)', 'tcf3a_extended_+_-_(432r)',
'tcf3b_+_+_(556r)', 'tcf3b_+_-_(127r)',
'tcf3b_extended_+_+_(556r)', 'tcf3b_extended_+_-_(127r)',
'tcf7_+_+_(31r)', 'tcf7_extended_+_+_(31r)', 'tcf7l1a_+_+_(92r)',
'tcf7l1a_+_-_(28r)', 'tcf7l1a_-_+_(64r)',
'tcf7l1a_extended_+_+_(92r)', 'tcf7l1a_extended_+_-_(28r)',
'tcf7l1a_extended_-_+_(64r)', 'tcf7l1b_+_+_(60r)',
'tcf7l1b_+_-_(30r)', 'tcf7l1b_-_+_(83r)',
'tcf7l1b_extended_+_+_(60r)', 'tcf7l1b_extended_+_-_(30r)',
'tcf7l1b_extended_-_+_(83r)', 'tcf7l2_+_+_(122r)',
'tcf7l2_+_-_(34r)', 'tcf7l2_extended_+_+_(122r)',
'tcf7l2_extended_+_-_(35r)', 'tead1a_+_+_(199r)',
'tead1a_+_-_(229r)', 'tead1a_extended_+_+_(199r)',
'tead1a_extended_+_-_(229r)', 'tead1b_+_+_(435r)',
'tead1b_+_-_(59r)', 'tead1b_-_-_(13r)',
'tead1b_extended_+_+_(435r)', 'tead1b_extended_+_-_(59r)',
'tead1b_extended_-_-_(13r)', 'tead3a_+_+_(1530r)',
'tead3a_+_-_(100r)', 'tead3a_extended_+_+_(1530r)',
'tead3a_extended_+_-_(100r)', 'tead3b_+_+_(1830r)',
'tead3b_+_-_(99r)', 'tead3b_extended_+_+_(1830r)',
'tead3b_extended_+_-_(99r)', 'tefa_+_+_(128r)', 'tefa_+_-_(134r)',
'tefa_extended_+_+_(128r)', 'tefa_extended_+_-_(134r)',
'tfap2a_+_+_(2300r)', 'tfap2a_+_-_(658r)', 'tfap2a_-_+_(126r)',
'tfap2a_-_-_(74r)', 'tfap2a_extended_+_+_(2300r)',
'tfap2a_extended_+_-_(641r)', 'tfap2a_extended_-_+_(126r)',
'tfap2a_extended_-_-_(74r)', 'tfap2c_+_+_(2564r)',
'tfap2c_+_-_(136r)', 'tfap2c_-_+_(173r)', 'tfap2c_-_-_(67r)',
'tfap2c_extended_+_+_(2564r)', 'tfap2c_extended_+_-_(136r)',
'tfap2c_extended_-_+_(173r)', 'tfap2c_extended_-_-_(58r)',
'tfap2e_+_-_(49r)', 'tfap2e_extended_+_-_(49r)',
'tfap4_+_-_(400r)', 'tfap4_-_+_(76r)', 'tfap4_extended_+_-_(400r)',
'tfap4_extended_-_+_(76r)', 'tfeb_+_+_(44r)', 'tfeb_+_-_(32r)',
'tfeb_extended_+_+_(44r)', 'tfeb_extended_+_-_(56r)',
'tp53_+_+_(78r)', 'tp53_extended_+_+_(78r)', 'vdrb_+_-_(19r)',
'vdrb_extended_+_-_(19r)', 'wt1a_+_+_(70r)', 'wt1a_+_-_(39r)',
'wt1a_extended_+_+_(70r)', 'wt1a_extended_+_-_(39r)',
'ybx1_+_+_(47r)', 'ybx1_+_-_(21r)', 'ybx1_extended_+_+_(47r)',
'ybx1_extended_+_-_(21r)', 'zbtb18_+_+_(876r)', 'zbtb18_+_-_(28r)',
'zbtb18_extended_+_+_(876r)', 'zbtb18_extended_+_-_(27r)',
'zeb1a_+_+_(10r)', 'zeb1a_extended_+_+_(10r)', 'zic1_+_+_(91r)',
'zic1_extended_+_+_(91r)', 'zic2a_+_+_(413r)', 'zic2a_+_-_(64r)',
'zic2a_extended_+_+_(413r)', 'zic2a_extended_+_-_(64r)',
'zic2b_+_+_(229r)', 'zic2b_+_-_(88r)', 'zic2b_-_+_(31r)',
'zic2b_-_-_(58r)', 'zic2b_extended_+_+_(229r)',
'zic2b_extended_+_-_(88r)', 'zic2b_extended_-_+_(31r)',
'zic2b_extended_-_-_(58r)', 'zic3_+_+_(17r)',
'zic3_extended_+_+_(17r)', 'znf598_+_-_(506r)',
'znf598_extended_+_-_(506r)'], dtype=object)
cell_data = pd.DataFrame([x.rsplit('_', 1)[0] for x in cistromes_auc.columns], index=cistromes_auc.columns).iloc[:, 0]
cell_data
non-dorsal margin(dome)_0 non-dorsal margin(dome)
non-dorsal margin(dome)_1 non-dorsal margin(dome)
non-dorsal margin(dome)_2 non-dorsal margin(dome)
non-dorsal margin(dome)_3 non-dorsal margin(dome)
non-dorsal margin(dome)_4 non-dorsal margin(dome)
...
dorsal posterior(50epi)_95 dorsal posterior(50epi)
dorsal posterior(50epi)_96 dorsal posterior(50epi)
dorsal posterior(50epi)_97 dorsal posterior(50epi)
dorsal posterior(50epi)_98 dorsal posterior(50epi)
dorsal posterior(50epi)_99 dorsal posterior(50epi)
Name: 0, Length: 9500, dtype: object
name='TCF4_+_+'
name.split('_')[0]
'TCF4'
tf_expr = dgem.loc[name.split('_')[0], :]
tf_acc = cistromes_auc.index[cistromes_auc.index.str.contains(name + '_(', regex=False)][0]
cistromes_auc.index
Index(['TCF4_+_+_(2024r)', 'TCF4_+_-_(340r)', 'TCF4_-_+_(34r)',
'TCF4_extended_+_+_(2024r)', 'TCF4_extended_+_-_(340r)',
'TCF4_extended_-_+_(34r)', 'atf3_+_+_(33r)', 'atf3_extended_+_+_(33r)',
'atf6_+_-_(1116r)', 'atf6_extended_+_-_(1116r)',
...
'zic2b_-_+_(31r)', 'zic2b_-_-_(58r)', 'zic2b_extended_+_+_(229r)',
'zic2b_extended_+_-_(88r)', 'zic2b_extended_-_+_(31r)',
'zic2b_extended_-_-_(58r)', 'zic3_+_+_(17r)', 'zic3_extended_+_+_(17r)',
'znf598_+_-_(506r)', 'znf598_extended_+_-_(506r)'],
dtype='object', length=486)
cistromes_auc_tf = cistromes_auc.loc[tf_acc, :]
prune_plot(scplus_obj,
'TCF4_+_+',
pseudobulk_variable = 'GEX_cell.type',
show_dot_plot = True,
show_line_plot = False,
color_dict = color_dict,
use_pseudobulk = True,
auc_key = 'eRegulon_AUC',
signature_key = 'Region_based',
seed=555)
# Gene based
prune_plot(scplus_obj,
'TCF4_+_+',
pseudobulk_variable = 'GEX_cell.type',
show_dot_plot = True,
show_line_plot = False,
color_dict = color_dict,
use_pseudobulk = True,
auc_key = 'eRegulon_AUC',
signature_key = 'Gene_based',
seed=555)
# output
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withBinarizedAUC.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
scplus_obj.uns.keys()
dict_keys(['Cistromes', 'search_space', 'region_to_gene', 'TF2G_adj', 'eRegulons_importance', 'eRegulon_metadata', 'eRegulon_signatures', 'eRegulon_AUC', 'Pseudobulk', 'TF_cistrome_correlation', 'selected_eRegulons', 'eRegulon_AUC_thresholds'])
# TF_cistrome_correlation (eGRN_gene_based)
scplus_obj.uns['TF_cistrome_correlation']['GEX_cell.type_eGRN_gene_based'].to_csv("output/scenicplus/TF_cistrome_gene_based_correlation.txt",sep ='\t',index=False,header=True)
# TF_cistrome_correlation (eGRN__region_based)
scplus_obj.uns['TF_cistrome_correlation']['GEX_cell.type_eGRN__region_based'].to_csv("output/scenicplus/TF_cistrome_region_based_correlation.txt",sep ='\t',index=False,header=True)
We will select a subset of regulons based on the correlation between the region based and the gene based regulons. We will only use the extended eRegulon if there is not a direct eRegulon available.
# correlation between region based regulons and gene based regulons
df1 = scplus_obj.uns['eRegulon_AUC']['Gene_based'].copy()
df2 = scplus_obj.uns['eRegulon_AUC']['Region_based'].copy()
df1.columns = [x.split('_(')[0] for x in df1.columns]
df2.columns = [x.split('_(')[0] for x in df2.columns]
correlations = df1.corrwith(df2, axis = 0)
correlations = correlations[abs(correlations) > 0.6]
# kepp only R2G +
keep = [x for x in correlations.index if '+_+' in x] + [x for x in correlations.index if '-_+' in x]
# keep extended if not direct
extended = [x for x in keep if 'extended' in x]
direct = [x for x in keep if not 'extended' in x]
keep_extended = [x for x in extended if not x.replace('extended_', '') in direct]
keep = direct + keep_extended
# keep regulons with more than 10 genes
keep_gene = [x for x in scplus_obj.uns['eRegulon_AUC']['Gene_based'].columns if x.split('_(')[0] in keep]
keep_gene = [x for x in keep_gene if (int(x.split('_(')[1].replace('g)', '')) > 10)]
keep_all = [x.split('_(')[0] for x in keep_gene]
keep_region = [x for x in scplus_obj.uns['eRegulon_AUC']['Region_based'].columns if x.split('_(')[0] in keep]
scplus_obj.uns['selected_eRegulons'] = {}
scplus_obj.uns['selected_eRegulons']['Gene_based'] = keep_gene
scplus_obj.uns['selected_eRegulons']['Region_based'] = keep_region
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withRSS.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
correlation_heatmap(scplus_obj,
auc_key = 'eRegulon_AUC',
signature_keys = ['Gene_based'],
selected_regulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
fcluster_threshold = 0.1,
save = 'output/network_analysis_results/regulon_cluster.pdf',
fontsize = 3)
# we can also check the overlap between eRegulons:
jaccard_heatmap(scplus_obj,
gene_or_region_based = 'Gene_based',
signature_key = 'eRegulon_signatures',
selected_regulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
fcluster_threshold = 0.1,
fontsize = 3,
save = 'output/network_analysis_results/regulon_overlap.pdf',
method='intersect')
# we can also binarize the eRegulons as in SCENIC. This information will be used afterwards for generating the loom file
binarize_AUC(scplus_obj,
auc_key='eRegulon_AUC',
out_key='eRegulon_AUC_thresholds',
signature_keys=['Gene_based', 'Region_based'],
n_cpu=8)
with open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withBinarizedAUC.pkl', 'wb') as f:
dill.dump(scplus_obj, f)
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withBinarizedAUC.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
regulon_specificity_scores(scplus_obj,
'GEX_cell.type',
signature_keys=['Gene_based'],
selected_regulons=scplus_obj.uns['selected_eRegulons']['Gene_based'],
out_key_suffix='_gene_based',
scale=False)
plot_rss(scplus_obj, 'GEX_cell.type_gene_based', num_columns=4, top_n=10,
save='output/network_analysis_results/celltype_specific_regulon.pdf')
/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/IPython/core/pylabtools.py:151: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
heatmap_dotplot(
scplus_obj = scplus_obj,
size_matrix = scplus_obj.uns['RSS']['GEX_cell.type_gene_based'],
color_matrix = scplus_obj.to_df('EXP'),
scale_size_matrix = True,
scale_color_matrix = True,
group_variable = 'GEX_cell.type',
subset_eRegulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
figsize = (25, 20),
orientation = 'vertical',
save = 'output/network_analysis_results/regulon_specificity.pdf',
split_repressor_activator=True)
/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:718: PlotnineWarning: Saving 25 x 20 in image. /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:719: PlotnineWarning: Filename: output/network_analysis_results/regulon_specificity.pdf
colnames = list()
for col in scplus_obj.uns['eRegulon_AUC']['Region_based'].columns:
if 'extended' not in col and ('+_+' in col or '-_+' in col):
colnames += [col]
len(colnames)
141
scplus_obj.uns['eRegulon_AUC']['Region_based_withoutExtend'] = scplus_obj.uns['eRegulon_AUC']['Region_based'][colnames]
scplus_obj.uns['eRegulon_AUC']['Region_based_withoutExtend']
| TCF4_+_+_(2024r) | TCF4_-_+_(34r) | atf3_+_+_(33r) | bhlha15_+_+_(79r) | bhlhe41_+_+_(65r) | cebpb_+_+_(942r) | creb3l2_+_+_(10r) | ctcf_+_+_(502r) | ctcf_-_+_(29r) | dmbx1a_+_+_(31r) | ... | tp53_+_+_(78r) | wt1a_+_+_(70r) | ybx1_+_+_(47r) | zbtb18_+_+_(876r) | zeb1a_+_+_(10r) | zic1_+_+_(91r) | zic2a_+_+_(413r) | zic2b_+_+_(229r) | zic2b_-_+_(31r) | zic3_+_+_(17r) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cell | |||||||||||||||||||||
| 30epi_AAACAGCCAGTTGCGT-1___cisTopic | 0.236184 | 0.029294 | 0.232283 | 0.162929 | 0.122100 | 0.190718 | 0.123698 | 0.026957 | 0.019013 | 0.025652 | ... | 0.118459 | 0.036569 | 0.123082 | 0.017417 | 0.126264 | 0.016276 | 0.047055 | 0.099914 | 0.111129 | 0.003803 |
| 30epi_AAACCAACACCTAAGC-1___cisTopic | 0.013967 | 0.064256 | 0.175003 | 0.072468 | 0.138885 | 0.004360 | 0.250777 | 0.053468 | 0.035071 | 0.047464 | ... | 0.016587 | 0.034934 | 0.162504 | 0.026205 | 0.237600 | 0.041107 | 0.103522 | 0.251155 | 0.102911 | 0.079095 |
| 30epi_AAACCGAAGGTTTGCG-1___cisTopic | 0.014294 | 0.071337 | 0.175724 | 0.069385 | 0.142614 | 0.004268 | 0.261329 | 0.053448 | 0.024866 | 0.041074 | ... | 0.020542 | 0.038202 | 0.166890 | 0.028121 | 0.211475 | 0.042303 | 0.096785 | 0.214687 | 0.120824 | 0.055013 |
| 30epi_AAACGCGCAGTAAGTA-1___cisTopic | 0.013777 | 0.056596 | 0.193737 | 0.081006 | 0.184027 | 0.004791 | 0.375361 | 0.047252 | 0.027643 | 0.049907 | ... | 0.030226 | 0.048837 | 0.149964 | 0.024393 | 0.216769 | 0.034169 | 0.107139 | 0.239600 | 0.100849 | 0.108058 |
| 30epi_AAAGCACCAGCATTAT-1___cisTopic | 0.014109 | 0.058481 | 0.184313 | 0.071980 | 0.155580 | 0.003913 | 0.376766 | 0.046059 | 0.017559 | 0.044940 | ... | 0.033327 | 0.067920 | 0.175644 | 0.033080 | 0.279368 | 0.039814 | 0.104459 | 0.229825 | 0.137459 | 0.068201 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| shield_s4_TTTGTCTAGGTTTGAC-1___cisTopic | 0.016303 | 0.082319 | 0.118115 | 0.077388 | 0.181197 | 0.004776 | 0.348147 | 0.036602 | 0.022523 | 0.041467 | ... | 0.038786 | 0.048248 | 0.136296 | 0.045564 | 0.145694 | 0.038615 | 0.097767 | 0.185086 | 0.204019 | 0.054160 |
| shield_s4_TTTGTCTAGTCACCTC-1___cisTopic | 0.014467 | 0.043104 | 0.131677 | 0.121333 | 0.191412 | 0.004147 | 0.507955 | 0.034547 | 0.031505 | 0.098749 | ... | 0.040940 | 0.116200 | 0.117108 | 0.056584 | 0.215721 | 0.120950 | 0.180552 | 0.304227 | 0.255366 | 0.217991 |
| shield_s4_TTTGTCTAGTCAGGCC-1___cisTopic | 0.016600 | 0.128103 | 0.099268 | 0.079432 | 0.146422 | 0.003817 | 0.302809 | 0.032908 | 0.006712 | 0.038093 | ... | 0.028671 | 0.060112 | 0.139824 | 0.122713 | 0.122987 | 0.045252 | 0.121687 | 0.185533 | 0.212812 | 0.052990 |
| shield_s4_TTTGTGGCACATGCTA-1___cisTopic | 0.013606 | 0.028060 | 0.143696 | 0.186219 | 0.194413 | 0.004218 | 0.459560 | 0.035174 | 0.016028 | 0.114225 | ... | 0.039883 | 0.096790 | 0.121368 | 0.042640 | 0.246653 | 0.117839 | 0.159517 | 0.267571 | 0.203638 | 0.161297 |
| shield_s4_TTTGTTGGTATTGAGT-1___cisTopic | 0.313955 | 0.030738 | 0.107686 | 0.213307 | 0.121744 | 0.249302 | 0.088372 | 0.022332 | 0.025721 | 0.022761 | ... | 0.140332 | 0.025315 | 0.081890 | 0.017971 | 0.050867 | 0.017883 | 0.030802 | 0.062230 | 0.122371 | 0.024670 |
40992 rows × 141 columns
heatmap_dotplot(
scplus_obj = scplus_obj,
size_matrix = scplus_obj.uns['eRegulon_AUC']['Region_based_withoutExtend'], #specify what to plot as dot sizes, target region enrichment in this case
color_matrix = scplus_obj.to_df('EXP'), #specify what to plot as colors, TF expression in this case
scale_size_matrix = True,
scale_color_matrix = True,
group_variable = 'GEX_cell.type',
subset_eRegulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
figsize = (25, 20),
orientation = 'vertical',
save = 'output/network_analysis_results/regulon_enhancerActivity.pdf',
split_repressor_activator=True)
/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:718: PlotnineWarning: Saving 25 x 20 in image. /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:719: PlotnineWarning: Filename: output/network_analysis_results/regulon_enhancerActivity.pdf
scplus_obj.uns['RSS']['GEX_cell.type_gene_based']
| TCF4_+_+_(734g) | bhlha15_+_+_(64g) | cebpb_+_+_(556g) | ctcf_+_+_(277g) | ctcf_-_+_(22g) | dmbx1a_+_+_(26g) | egr2a_+_+_(35g) | egr2b_+_+_(42g) | elf1_+_+_(67g) | elf1_-_+_(30g) | ... | tead3b_+_+_(782g) | tfap2a_+_+_(545g) | tfap2c_+_+_(485g) | tfap2c_-_+_(73g) | ybx1_+_+_(45g) | zbtb18_+_+_(205g) | zic1_+_+_(41g) | zic2a_+_+_(140g) | zic2b_+_+_(118g) | zic3_+_+_(17g) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| evl(30epi) | 0.204734 | 0.173197 | 0.208967 | 0.170580 | 0.170599 | 0.168651 | 0.169338 | 0.170173 | 0.186495 | 0.168435 | ... | 0.189761 | 0.180227 | 0.170886 | 0.172020 | 0.169481 | 0.168878 | 0.168152 | 0.169417 | 0.170190 | 0.169013 |
| dorsal margin(30epi) | 0.172401 | 0.170434 | 0.172491 | 0.178957 | 0.168433 | 0.172360 | 0.170333 | 0.171769 | 0.173713 | 0.175362 | ... | 0.172646 | 0.171010 | 0.173579 | 0.173539 | 0.177311 | 0.170977 | 0.170828 | 0.172688 | 0.174987 | 0.171819 |
| non-dorsal margin(30epi) | 0.175760 | 0.174109 | 0.176001 | 0.186304 | 0.168291 | 0.174474 | 0.169623 | 0.171931 | 0.177755 | 0.178147 | ... | 0.175526 | 0.173970 | 0.178146 | 0.175478 | 0.185385 | 0.172035 | 0.171323 | 0.176345 | 0.179529 | 0.172820 |
| ectoderm(30epi) | 0.187082 | 0.176057 | 0.188877 | 0.218329 | 0.170317 | 0.185453 | 0.169627 | 0.173886 | 0.188271 | 0.201121 | ... | 0.188973 | 0.184852 | 0.196404 | 0.174310 | 0.212606 | 0.178858 | 0.179545 | 0.183302 | 0.204064 | 0.180442 |
| apoptosis like(30epi) | 0.168281 | 0.168059 | 0.168378 | 0.169877 | 0.167528 | 0.167728 | 0.167645 | 0.167763 | 0.168408 | 0.168871 | ... | 0.168435 | 0.168303 | 0.168909 | 0.167792 | 0.169683 | 0.167944 | 0.168182 | 0.168201 | 0.169103 | 0.167953 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| dorsal ectoderm(shield) | 0.224567 | 0.209960 | 0.223933 | 0.282054 | 0.188495 | 0.232317 | 0.229591 | 0.219829 | 0.230558 | 0.285438 | ... | 0.230473 | 0.233360 | 0.254569 | 0.218797 | 0.283444 | 0.218498 | 0.234391 | 0.241405 | 0.280718 | 0.248107 |
| endoderm(shield) | 0.174806 | 0.178771 | 0.174685 | 0.178753 | 0.173080 | 0.177627 | 0.174959 | 0.177863 | 0.178295 | 0.176696 | ... | 0.175762 | 0.174788 | 0.177652 | 0.178361 | 0.178528 | 0.173511 | 0.173169 | 0.179302 | 0.175756 | 0.177557 |
| evl(shield) | 0.278154 | 0.188867 | 0.285436 | 0.171548 | 0.180985 | 0.168858 | 0.172374 | 0.172595 | 0.219142 | 0.169095 | ... | 0.250275 | 0.215063 | 0.177524 | 0.177564 | 0.172052 | 0.173389 | 0.174750 | 0.173603 | 0.173627 | 0.180771 |
| ysl(shield) | 0.170037 | 0.171708 | 0.171291 | 0.168991 | 0.169667 | 0.167780 | 0.168067 | 0.167683 | 0.169875 | 0.168787 | ... | 0.174798 | 0.169435 | 0.169761 | 0.169892 | 0.168143 | 0.168559 | 0.167910 | 0.168900 | 0.168028 | 0.171475 |
| dorsal anterior(shield) | 0.172001 | 0.182495 | 0.171776 | 0.173761 | 0.173514 | 0.178601 | 0.171212 | 0.175501 | 0.173359 | 0.171793 | ... | 0.173269 | 0.172551 | 0.173015 | 0.173874 | 0.171888 | 0.171918 | 0.173317 | 0.176316 | 0.172409 | 0.176351 |
95 rows × 84 columns
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withRSS.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
export_to_loom(scplus_obj,
signature_key = 'Gene_based',
tree_structure = ('10x_multiome_zebrafish', 'SCENIC+'),
title = 'Tutorial - Gene based eGRN',
nomenclature = "danRer11",
out_fname=outDir + 'scenicplus/zebrafish_eGRN_gene_based.loom')
2022-11-09 22:19:28,505 SCENIC+ INFO Formatting data 2022-11-09 22:19:31,483 SCENIC+ INFO Creating minimal loom 2022-11-09 22:20:40,918 SCENIC+ INFO Adding annotations 2022-11-09 22:20:44,780 SCENIC+ INFO Exporting
export_to_loom(scplus_obj,
signature_key = 'Region_based',
tree_structure = ('10x_multiome_zebrafish', 'SCENIC+'),
title = 'Tutorial - Gene based eGRN',
nomenclature = "danRer11",
out_fname=outDir + 'scenicplus/zebrafish_eGRN_region_based.loom')
2022-11-09 22:36:43,375 SCENIC+ INFO Formatting data
(scplus_obj.uns['selected_eRegulons']['Gene_based'])
['TCF4_+_+_(734g)', 'bhlha15_+_+_(64g)', 'cebpb_+_+_(556g)', 'ctcf_+_+_(277g)', 'ctcf_-_+_(22g)', 'dmbx1a_+_+_(26g)', 'egr2a_+_+_(35g)', 'egr2b_+_+_(42g)', 'elf1_+_+_(67g)', 'elf1_-_+_(30g)', 'en2a_+_+_(17g)', 'erf_-_+_(14g)', 'etv5b_+_+_(136g)', 'fli1a_+_+_(61g)', 'foxa_+_+_(85g)', 'foxg1a_+_+_(29g)', 'gata2a_+_+_(269g)', 'gata4_+_+_(361g)', 'gata5_+_+_(47g)', 'gata6_+_+_(329g)', 'gli1_+_+_(60g)', 'gli2b_+_+_(53g)', 'gli3_+_+_(20g)', 'glis1b_+_+_(23g)', 'grhl1_+_+_(976g)', 'gsc_+_+_(53g)', 'her6_+_+_(215g)', 'her6_-_+_(12g)', 'her9_+_+_(211g)', 'hey1_+_+_(388g)', 'hic1_+_+_(204g)', 'hnf1ba_+_+_(71g)', 'hnf4a_+_+_(565g)', 'hnf4b_+_+_(603g)', 'hoxa9a_+_+_(52g)', 'hoxc9a_+_+_(33g)', 'klf4_+_+_(647g)', 'klf6a_+_+_(56g)', 'klf6a_-_+_(40g)', 'meis3_+_+_(32g)', 'mespaa_+_+_(29g)', 'myod1_+_+_(57g)', 'nfya_+_+_(68g)', 'nr2f2_+_+_(55g)', 'nr2f6b_+_+_(184g)', 'onecut1_+_+_(947g)', 'otx1_+_+_(140g)', 'otx2a_+_+_(154g)', 'otx2b_+_+_(58g)', 'pitx2_+_+_(32g)', 'prox1a_+_+_(85g)', 'rfx3_+_+_(204g)', 'rfx4_+_+_(141g)', 'rreb1b_+_+_(489g)', 'rxraa_+_+_(86g)', 'rxrgb_+_+_(259g)', 'snai1a_+_+_(42g)', 'sox19a_+_+_(44g)', 'sox19b_+_+_(44g)', 'sox2_+_+_(46g)', 'sox3_+_+_(58g)', 'tbx16_+_+_(239g)', 'tbx16l_+_+_(119g)', 'tbx1_+_+_(147g)', 'tbxta_+_+_(275g)', 'tbxtb_+_+_(221g)', 'tcf12_+_+_(365g)', 'tcf3b_+_+_(199g)', 'tcf7_+_+_(22g)', 'tcf7l1a_-_+_(35g)', 'tcf7l1b_-_+_(45g)', 'tcf7l2_+_+_(107g)', 'tead1b_+_+_(301g)', 'tead3a_+_+_(683g)', 'tead3b_+_+_(782g)', 'tfap2a_+_+_(545g)', 'tfap2c_+_+_(485g)', 'tfap2c_-_+_(73g)', 'ybx1_+_+_(45g)', 'zbtb18_+_+_(205g)', 'zic1_+_+_(41g)', 'zic2a_+_+_(140g)', 'zic2b_+_+_(118g)', 'zic3_+_+_(17g)']
(scplus_obj.uns['selected_eRegulons']['Gene_based']).to_csv(outDir + 'scenicplus/eRegulon_name.csv',sep='\t',index=False)
with open(outDir + 'scenicplus/eRegulon_name.csv', 'w') as fp:
for item in (scplus_obj.uns['selected_eRegulons']['Gene_based']):
# write each item on a new line
fp.write("%s\n" % item)
print('Done')
Done
scplus_obj.uns['RSS']['GEX_cell.type_gene_based'].to_csv(outDir + 'scenicplus/eRegulon_RSS.csv',sep='\t')